Source code for FAIRLinked.RDFTableConversion.MDS_DF.metadata_manager

import os
import json
from datetime import datetime, timezone
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, SKOS
from typing import Optional

#### METADATA OBJECT ####

[docs] class Metadata: """ Manages semantic metadata and synchronization between JSON-LD templates and RDF graphs. This class acts as a specialized container for experimental metadata. It maintains a 'source of truth' using an RDFLib Graph to ensure semantic consistency, while providing a standard dictionary interface for JSON-LD serialization. It also tracks the success of metadata mapping through matched and unmatched logs. Attributes: metadata_temp (dict): The JSON-LD representation of the metadata template, including @context and @graph. matched_log (list): A historical record of columns successfully mapped to ontology terms during the initialization process. unmatched_log (list): A record of columns that failed to find an automated match in the reference ontology. template_graph (rdflib.Graph): The internal RDFLib Graph used for complex updates, validation, and semantic querying. MDS (rdflib.Namespace): Namespace for Materials Data Science ontology terms. QUDT (rdflib.Namespace): Namespace for Quantities, Units, Dimensions, and Types. UNIT (rdflib.Namespace): Namespace for QUDT unit individuals. """ def __init__(self, metadata_template: dict, matched_log: Optional[list] = None, unmatched_log: Optional[list] = None): """ Initializes the Metadata manager and parses the template into an RDF graph. Args: metadata_template (dict): The initial JSON-LD dictionary structure. matched_log (list, optional): Pre-existing log of successful matches. Defaults to an empty list. unmatched_log (list, optional): Pre-existing log of failed matches. Defaults to an empty list. """ self.metadata_temp = metadata_template self.matched_log = matched_log if matched_log is not None else [] self.unmatched_log = unmatched_log if unmatched_log is not None else [] self.template_graph = Graph() self.template_graph.parse(data=json.dumps(metadata_template), format="json-ld") self.MDS = Namespace("https://cwrusdle.bitbucket.io/mds/") self.QUDT = Namespace("http://qudt.org/schema/qudt/") self.UNIT = Namespace("https://qudt.org/vocab/unit/") self.template_graph.bind("unit", self.UNIT) self.template_graph.bind("skos", SKOS) self.template_graph.bind("mds", self.MDS) self.template_graph.bind("qudt", self.QUDT) def _normalize_graph_structure(self, data: any) -> dict: """ Ensure the serialized JSON-LD always has a '@graph' list. """ # Fix 1: Handle if data is already a list (flattened JSON-LD) if isinstance(data, list): return { "@context": self.metadata_temp.get("@context", {}), "@graph": data } # Fix 2: Handle if data is a dict but missing '@graph' if isinstance(data, dict) and "@graph" not in data: # Use a safe pop that only runs if data is a dict context = data.pop("@context") if "@context" in data else {} node = {k: v for k, v in data.items()} return {"@context": context, "@graph": [node]} return data
[docs] def save_metadata(self, output_path: str, matched_log_path: Optional[str] = None, unmatched_log_path: Optional[str] = None): """ Exports the synchronized metadata template and import logs to the file system. This method performs three primary tasks: 1. Serializes the current JSON-LD metadata template (the source of truth) to a file. 2. Optionally exports a log of all columns successfully matched during initialization. 3. Optionally exports a deduplicated log of columns that were not found in the RDF source. Args: output_path (str): File path where the JSON-LD metadata template will be saved. matched_log_path (str, optional): File path to save the list of successfully matched columns. If None, no log is created. unmatched_log_path (str, optional): File path to save the unique list of columns missing RDF metadata. If None, no log is created. Note: This method automatically creates any missing parent directories for the provided file paths to prevent 'FileNotFoundError'. Returns: None """ os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: json.dump(self.metadata_temp, f, indent=2) if isinstance(matched_log_path, str): os.makedirs(os.path.dirname(matched_log_path), exist_ok=True) # Write matched log with open(matched_log_path, "w") as f: f.write("\n".join(self.matched_log)) if isinstance(unmatched_log_path, str): # Write unmatched log (remove duplicates with set) os.makedirs(os.path.dirname(unmatched_log_path), exist_ok=True) with open(unmatched_log_path, "w") as f: f.write("\n".join(sorted(set(self.unmatched_log))))
##### TEMPLATE MANIPULATION ####
[docs] def update_bulk(self, metadata_template: dict): """ Merges an external metadata template into the current instance. Iterates through the '@graph' and decides whether to update existing columns or add new ones. """ # 1. Extract the graph list from the JSON-LD structure # This handles both flattened and framed JSON-LD formats graph_entries = metadata_template.get('@graph', []) if not graph_entries: print("⚠️ No valid metadata entries found in the provided template.") return updates_count = 0 adds_count = 0 for entry in graph_entries: # Identify the column name (the unique key for our mapping) col_name = entry.get("skos:altLabel") if not col_name: continue # Check if this column already exists in our current graph existing_subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name)) if existing_subject: # OPTION A: Column exists -> Update specific fields # We map the JSON keys to the 'field' shorthand used in update_template field_mapping = { "skos:definition": "definition", "qudt:hasUnit": "unit", "rdf:type": "type", "@type": "type", "mds:hasStudyStage": "stage", "skos:note": "note" } for json_key, field_shorthand in field_mapping.items(): value = entry.get(json_key) if value: # Handle nested @id structures (like in units) if isinstance(value, dict) and "@id" in value: value = value["@id"] self.update_template(col_name, field_shorthand, str(value)) updates_count += 1 else: # OPTION B: Column is new -> Add it to the graph rdf_type = entry.get("@type") or "cco:ont00000958" unit_info = entry.get("qudt:hasUnit", "unit:UNITLESS") unit = unit_info["@id"] if isinstance(unit_info, dict) else unit_info self.add_column_metadata( col_name=col_name, rdf_type=str(rdf_type), unit=str(unit).replace("unit:", ""), # Strip prefix for the function definition=entry.get("skos:definition", "Definition not available"), study_stage=entry.get("mds:hasStudyStage", "UNK") ) adds_count += 1 print(f"📊 Bulk Update Summary: {updates_count} columns updated, {adds_count} columns added.")
[docs] def update_template(self, col_name: str, field: str, value: str): """ Updates both the JSON-LD template and the RDF Graph in lock-step. This ensures self.metadata_temp is updated while keeping the internal RDFLib graph in sync for serialization. """ # 1. Map shorthands to both JSON keys and RDF Predicates mapping = { "definition": ("skos:definition", SKOS.definition), "unit": ("qudt:hasUnit", self.QUDT.hasUnit), "type": ("@type", RDF.type), "stage": ("mds:hasStudyStage", self.MDS.hasStudyStage), "note": ("skos:note", SKOS.note) } if field not in mapping: print(f"⚠️ Field '{field}' is not recognized.") return json_key, predicate = mapping[field] # --- PART A: Update self.metadata_temp (The JSON source) --- # This is what you were doing; it works because of Python's object referencing graph_list = self.metadata_temp.get("@graph", []) for item in graph_list: if item.get("skos:altLabel") == col_name: if field == "unit": # Standardize to a dict structure for QUDT item[json_key] = {"@id": f"unit:{value}" if ":" not in value else value} else: item[json_key] = value break # --- PART B: Update self.template_graph (The RDF source) --- # This ensures serialize_row and other graph functions stay in sync subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name)) if subject: # Determine the correct RDF Object type if field == "unit": unit_uri = value if ":" in value else f"unit:{value}" new_obj = self.UNIT[unit_uri.split(":")[1]] if "unit:" in unit_uri else URIRef(unit_uri) elif field == "type": new_obj = self.MDS[value] if ":" not in value else URIRef(value) else: new_obj = Literal(value) # Overwrite the triple in the graph self.template_graph.set((subject, predicate, new_obj)) print(f"✅ Synchronized {field} for '{col_name}'.") else: print(f"⚠️ Warning: '{col_name}' updated in JSON but not found in RDF Graph.")
[docs] def add_column_metadata(self, col_name: str, rdf_type: str, unit: str = "UNITLESS", definition: str = "No definition provided", study_stage: str = "UNK"): # 1. Direct JSON check graph = self.metadata_temp.get("@graph", []) if any(item.get("skos:altLabel") == col_name for item in graph): return # 2. Build a clean Python Dict (No RDFLib objects here) entry = { "@id": rdf_type if ":" in rdf_type else f"mds:{rdf_type}", "@type": rdf_type if ":" in rdf_type else f"mds:{rdf_type}", "skos:altLabel": col_name, "skos:definition": definition, "qudt:hasUnit": {"@id": f"unit:{unit}"}, "prov:generatedAtTime": datetime.now(timezone.utc).isoformat() + "Z", "mds:hasStudyStage": study_stage } # 3. Direct append to the list that serialize_row uses graph.append(entry) self.metadata_temp["@graph"] = graph # Optional: If you use the internal graph for OTHER things, # parse ONLY this new entry to keep it in sync self.template_graph.parse(data=json.dumps(entry), format="json-ld")
[docs] def delete_column_metadata(self, col_name: str): """ Removes all metadata associated with a specific column from both the internal JSON template and the RDF graph. """ # 1. Direct JSON Manipulation (The "Clean" way) # We filter the @graph list directly to avoid RDFLib's serialization noise original_graph = self.metadata_temp.get("@graph", []) new_graph = [item for item in original_graph if item.get("skos:altLabel") != col_name] if len(new_graph) < len(original_graph): # 2. Update the JSON template used by serialize_row() self.metadata_temp["@graph"] = new_graph # 3. Keep the RDF graph in sync # We find the node and remove its triples so the graph remains accurate subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name)) if subject: self.template_graph.remove((subject, None, None)) print(f"✅ Successfully deleted metadata for column: '{col_name}'.") else: print(f"⚠️ Column '{col_name}' not found in the metadata.")
[docs] def print_template(self, format: str = "table"): """ Prints the current metadata template. :param format: 'table' for a summarized DataFrame view, 'json' for the raw JSON-LD structure. """ if format.lower() == "json": print(json.dumps(self.metadata_temp, indent=2)) elif format.lower() == "table": graph_data = self.metadata_temp.get("@graph", []) if not graph_data: print("The template graph is currently empty.") return # Extracting key info into a list of dicts for the DataFrame summary_list = [] for item in graph_data: # Handle qudt:hasUnit which might be a dict, a list, or missing unit_raw = item.get("qudt:hasUnit", "None") # If it's a list, take the first entry if isinstance(unit_raw, list) and len(unit_raw) > 0: unit_val = unit_raw[0].get("@id", "None") if isinstance(unit_raw[0], dict) else str(unit_raw[0]) # If it's a dict, get the @id elif isinstance(unit_raw, dict): unit_val = unit_raw.get("@id", "None") # Otherwise, just stringify it else: unit_val = str(unit_raw) summary_list.append({ "Label": item.get("skos:altLabel", "N/A"), "Type": item.get("@type", "N/A"), "Unit": unit_val, "Definition": item.get("skos:definition", "")[:50] + "..." if len(item.get("skos:definition", "")) > 50 else item.get("skos:definition", ""), "Study Stage": item.get("mds:hasStudyStage", "N/A") }) df_summary = pd.DataFrame(summary_list) print("\n--- Metadata Template Summary ---") # If in a Jupyter Notebook, this renders as a nice HTML table # If in a terminal, it prints a clean text table print(df_summary.to_string(index=False)) print("------------------------------------------\n") else: print(f"Unknown format '{format}'. Use 'table' or 'json'.")