# Source code for FAIRLinked.RDFTableConversion.MDS_DF.metadata_manager
import os
import json
from datetime import datetime, timezone
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, SKOS
from typing import Optional
#### METADATA OBJECT ####
# [docs] -- link marker left over from the rendered HTML source listing; not part of the module.
class Metadata:
    """
    Manages semantic metadata and synchronization between JSON-LD templates and RDF graphs.

    This class acts as a specialized container for experimental metadata. It keeps
    two representations of the same template side by side: a plain JSON-LD
    dictionary (``metadata_temp``) and an RDFLib graph (``template_graph``) parsed
    from it. The mutating methods update both. It also carries the matched and
    unmatched column logs handed to the constructor.

    Attributes:
        metadata_temp (dict): The JSON-LD representation of the metadata template,
            including @context and @graph.
        matched_log (list): A record of columns successfully mapped to
            ontology terms during the initialization process.
        unmatched_log (list): A record of columns that failed to find an automated
            match in the reference ontology.
        template_graph (rdflib.Graph): The internal RDFLib Graph parsed from the
            template and kept in sync by the mutating methods.
        MDS (rdflib.Namespace): Namespace for Materials Data Science ontology terms.
        QUDT (rdflib.Namespace): Namespace for Quantities, Units, Dimensions, and Types.
        UNIT (rdflib.Namespace): Namespace for QUDT unit individuals.
    """

    def __init__(self,
                 metadata_template: dict,
                 matched_log: Optional[list] = None,
                 unmatched_log: Optional[list] = None):
        """
        Initializes the Metadata manager and parses the template into an RDF graph.

        Args:
            metadata_template (dict): The initial JSON-LD dictionary structure.
                It is stored by reference, not copied.
            matched_log (list, optional): Pre-existing log of successful matches.
                Defaults to an empty list.
            unmatched_log (list, optional): Pre-existing log of failed matches.
                Defaults to an empty list.
        """
        self.metadata_temp = metadata_template
        self.matched_log = matched_log if matched_log is not None else []
        self.unmatched_log = unmatched_log if unmatched_log is not None else []

        # Round-trip through a JSON string because the parser takes serialized data.
        self.template_graph = Graph()
        self.template_graph.parse(data=json.dumps(metadata_template), format="json-ld")

        self.MDS = Namespace("https://cwrusdle.bitbucket.io/mds/")
        self.QUDT = Namespace("http://qudt.org/schema/qudt/")
        self.UNIT = Namespace("https://qudt.org/vocab/unit/")

        self.template_graph.bind("unit", self.UNIT)
        self.template_graph.bind("skos", SKOS)
        self.template_graph.bind("mds", self.MDS)
        self.template_graph.bind("qudt", self.QUDT)

    #### INTERNAL HELPERS ####

    @staticmethod
    def _unwrap_jsonld_value(value):
        """Reduce a JSON-LD value to a plain scalar.

        A list collapses to its first element (``None`` when empty); a dict
        collapses to its ``@id`` or, failing that, its ``@value``. Anything else
        is returned unchanged.
        """
        if isinstance(value, list):
            value = value[0] if value else None
        if isinstance(value, dict):
            value = value.get("@id", value.get("@value"))
        return value

    @staticmethod
    def _ensure_parent_dir(path) -> None:
        """Create the parent directory of ``path`` if the path has one."""
        parent = os.path.dirname(path)
        # dirname() is "" for a bare filename, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _resolve_iri(self, value: str, default_ns) -> "URIRef":
        """Turn a bare local name, a CURIE, or an absolute IRI into a URIRef.

        Args:
            value: ``"Name"``, ``"prefix:Name"`` or a full IRI such as ``"https://..."``.
            default_ns: Namespace used when ``value`` contains no colon.
        """
        if ":" not in value:
            return default_ns[value]

        prefix, _, local = value.partition(":")
        # "scheme://..." is an absolute IRI, never a CURIE.
        if not local.startswith("//"):
            known = {"unit": self.UNIT, "mds": self.MDS, "qudt": self.QUDT, "skos": SKOS}
            namespace = known.get(prefix)
            if namespace is None:
                # Fall back to whatever prefixes are bound on the graph.
                namespace = next(
                    (ns for bound, ns in self.template_graph.namespaces() if bound == prefix),
                    None,
                )
            if namespace is not None:
                return URIRef(str(namespace) + local)
        return URIRef(value)

    @classmethod
    def _summarize_entry(cls, item: dict) -> dict:
        """Build one row of the table printed by ``print_template``."""
        unit_raw = item.get("qudt:hasUnit", "None")
        if isinstance(unit_raw, list) and len(unit_raw) > 0:
            first = unit_raw[0]
            unit_val = first.get("@id", "None") if isinstance(first, dict) else str(first)
        elif isinstance(unit_raw, dict):
            unit_val = unit_raw.get("@id", "None")
        else:
            unit_val = str(unit_raw)

        # The definition may be a plain string or a JSON-LD value object/list.
        definition = cls._unwrap_jsonld_value(item.get("skos:definition", ""))
        definition = "" if definition is None else str(definition)
        if len(definition) > 50:
            definition = definition[:50] + "..."

        return {
            "Label": item.get("skos:altLabel", "N/A"),
            "Type": item.get("@type", "N/A"),
            "Unit": unit_val,
            "Definition": definition,
            "Study Stage": item.get("mds:hasStudyStage", "N/A"),
        }

    def _normalize_graph_structure(self, data: object) -> dict:
        """
        Ensure the serialized JSON-LD always has a '@graph' list.

        A list is wrapped together with this instance's ``@context``. A dict
        without ``@graph`` is treated as a single node and wrapped with its own
        ``@context`` (or an empty one). Any other input is returned unchanged.
        The input object is never modified.
        """
        if isinstance(data, list):
            return {
                "@context": self.metadata_temp.get("@context", {}),
                "@graph": data,
            }

        if isinstance(data, dict) and "@graph" not in data:
            # Build the node as a copy instead of popping '@context' out of the
            # caller's dict.
            node = {k: v for k, v in data.items() if k != "@context"}
            return {"@context": data.get("@context", {}), "@graph": [node]}

        return data

    #### PERSISTENCE ####

    def save_metadata(self, output_path: str, matched_log_path: Optional[str] = None, unmatched_log_path: Optional[str] = None):
        """
        Exports the metadata template and import logs to the file system.

        This method performs three tasks:
        1. Serializes the current JSON-LD metadata template to ``output_path``.
        2. Optionally writes the matched-column log, one entry per line.
        3. Optionally writes the unmatched-column log, deduplicated and sorted,
           one entry per line.

        Args:
            output_path (str): File path where the JSON-LD metadata template will be saved.
            matched_log_path (str, optional): File path for the matched-column log.
                If None, no log is created.
            unmatched_log_path (str, optional): File path for the unmatched-column
                log. If None, no log is created.

        Note:
            Missing parent directories are created. A bare filename (no directory
            part) is written to the current working directory.

        Returns:
            None
        """
        self._ensure_parent_dir(output_path)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(self.metadata_temp, f, indent=2)

        if matched_log_path is not None:
            self._ensure_parent_dir(matched_log_path)
            with open(matched_log_path, "w", encoding="utf-8") as f:
                f.write("\n".join(str(entry) for entry in self.matched_log))

        if unmatched_log_path is not None:
            self._ensure_parent_dir(unmatched_log_path)
            unique_entries = sorted({str(entry) for entry in self.unmatched_log})
            with open(unmatched_log_path, "w", encoding="utf-8") as f:
                f.write("\n".join(unique_entries))

    ##### TEMPLATE MANIPULATION ####

    def update_bulk(self, metadata_template: dict):
        """
        Merges an external metadata template into the current instance.

        Every entry carrying a ``skos:altLabel`` is either used to update the
        column of that name (when the RDF graph already has it) or added as a
        new column. A summary line is printed at the end.

        Args:
            metadata_template (dict): JSON-LD with an ``@graph`` list. A bare list
                of nodes or a single node dict is accepted as well.
        """
        normalized = self._normalize_graph_structure(metadata_template)
        raw_entries = normalized.get("@graph") if isinstance(normalized, dict) else None
        if isinstance(raw_entries, dict):
            # JSON-LD allows '@graph' to hold a single node object.
            raw_entries = [raw_entries]
        graph_entries = [e for e in (raw_entries or []) if isinstance(e, dict) and e]

        if not graph_entries:
            print("⚠️ No valid metadata entries found in the provided template.")
            return

        # JSON-LD key -> 'field' shorthand understood by update_template.
        field_mapping = {
            "skos:definition": "definition",
            "qudt:hasUnit": "unit",
            "rdf:type": "type",
            "@type": "type",
            "mds:hasStudyStage": "stage",
            "skos:note": "note"
        }

        updates_count = 0
        adds_count = 0

        for entry in graph_entries:
            # The column name is the key that ties an entry to a column.
            col_name = entry.get("skos:altLabel")
            if not col_name:
                continue

            existing_subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name))

            if existing_subject is not None:
                # Column exists -> push every field present in the entry.
                touched = False
                for json_key, field_shorthand in field_mapping.items():
                    value = self._unwrap_jsonld_value(entry.get(json_key))
                    if not value:
                        continue
                    self.update_template(col_name, field_shorthand, str(value))
                    touched = True
                # Counted once per column, and only if something was applied.
                if touched:
                    updates_count += 1
            else:
                # Column is new -> add it.
                rdf_type = self._unwrap_jsonld_value(entry.get("@type")) or "cco:ont00000958"
                unit = str(self._unwrap_jsonld_value(entry.get("qudt:hasUnit")) or "unit:UNITLESS")
                if unit.startswith("unit:"):
                    unit = unit[len("unit:"):]

                added = self.add_column_metadata(
                    col_name=col_name,
                    rdf_type=str(rdf_type),
                    unit=unit,
                    definition=entry.get("skos:definition", "Definition not available"),
                    study_stage=entry.get("mds:hasStudyStage", "UNK")
                )
                if added:
                    adds_count += 1

        print(f"📊 Bulk Update Summary: {updates_count} columns updated, {adds_count} columns added.")

    def update_template(self, col_name: str, field: str, value: str):
        """
        Updates both the JSON-LD template and the RDF Graph in lock-step.

        Args:
            col_name (str): ``skos:altLabel`` of the column to modify.
            field (str): One of ``definition``, ``unit``, ``type``, ``stage``,
                ``note``. Any other value prints a warning and changes nothing.
            value (str): New value. For ``unit`` and ``type`` it may be a bare
                local name, a ``prefix:name`` CURIE, or a full IRI.
        """
        # Shorthand -> (JSON-LD key, RDF predicate)
        mapping = {
            "definition": ("skos:definition", SKOS.definition),
            "unit": ("qudt:hasUnit", self.QUDT.hasUnit),
            "type": ("@type", RDF.type),
            "stage": ("mds:hasStudyStage", self.MDS.hasStudyStage),
            "note": ("skos:note", SKOS.note)
        }

        if field not in mapping:
            print(f"⚠️ Field '{field}' is not recognized.")
            return

        json_key, predicate = mapping[field]

        # --- PART A: JSON template (first entry with a matching label) ---
        for item in self.metadata_temp.get("@graph", []):
            if item.get("skos:altLabel") == col_name:
                if field == "unit":
                    # Units are stored as an {"@id": ...} reference.
                    item[json_key] = {"@id": value if ":" in value else f"unit:{value}"}
                else:
                    item[json_key] = value
                break

        # --- PART B: RDF graph ---
        subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name))
        if subject is None:
            print(f"⚠️ Warning: '{col_name}' updated in JSON but not found in RDF Graph.")
            return

        if field == "unit":
            new_obj = self._resolve_iri(value, self.UNIT)
        elif field == "type":
            new_obj = self._resolve_iri(value, self.MDS)
        else:
            new_obj = Literal(value)

        # set() replaces any existing object for this subject/predicate pair.
        self.template_graph.set((subject, predicate, new_obj))
        print(f"✅ Synchronized {field} for '{col_name}'.")

    def add_column_metadata(self, col_name: str, rdf_type: str, unit: str = "UNITLESS",
                            definition: str = "No definition provided", study_stage: str = "UNK"):
        """
        Adds a new column entry to the JSON-LD template and the RDF graph.

        Nothing happens if the JSON template already has an entry whose
        ``skos:altLabel`` equals ``col_name``.

        Args:
            col_name (str): Column name, stored as ``skos:altLabel``.
            rdf_type (str): Type term; a value without a colon gets the ``mds:`` prefix.
            unit (str): Unit term; a value without a colon gets the ``unit:`` prefix.
            definition (str): Stored as ``skos:definition``.
            study_stage (str): Stored as ``mds:hasStudyStage``.

        Returns:
            bool: True if an entry was added, False if the column already existed.
        """
        graph = self.metadata_temp.get("@graph", [])
        if any(item.get("skos:altLabel") == col_name for item in graph):
            return False

        type_term = rdf_type if ":" in rdf_type else f"mds:{rdf_type}"
        unit_term = unit if ":" in unit else f"unit:{unit}"

        # Plain Python values only, so the entry can be JSON-serialized as is.
        entry = {
            "@id": type_term,
            "@type": type_term,
            "skos:altLabel": col_name,
            "skos:definition": definition,
            "qudt:hasUnit": {"@id": unit_term},
            # isoformat() on an aware UTC datetime ends in "+00:00"; swap it for
            # the "Z" designator rather than appending a second zone marker.
            "prov:generatedAtTime": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "mds:hasStudyStage": study_stage
        }

        graph.append(entry)
        self.metadata_temp["@graph"] = graph

        # Parse the entry together with the template's @context so its prefixed
        # terms expand the same way as the rest of the template.
        document = {"@context": self.metadata_temp.get("@context", {}), "@graph": [entry]}
        self.template_graph.parse(data=json.dumps(document), format="json-ld")
        return True

    def delete_column_metadata(self, col_name: str):
        """
        Removes all metadata associated with a specific column from both
        the internal JSON template and the RDF graph.

        Args:
            col_name (str): ``skos:altLabel`` of the column to remove.
        """
        original_graph = self.metadata_temp.get("@graph", [])
        new_graph = [item for item in original_graph if item.get("skos:altLabel") != col_name]

        if len(new_graph) == len(original_graph):
            print(f"⚠️ Column '{col_name}' not found in the metadata.")
            return

        self.metadata_temp["@graph"] = new_graph

        # NOTE: add_column_metadata uses the type term as the node '@id', so
        # columns of the same type share a subject and are removed together here.
        subject = self.template_graph.value(predicate=SKOS.altLabel, object=Literal(col_name))
        if subject is not None:
            self.template_graph.remove((subject, None, None))

        print(f"✅ Successfully deleted metadata for column: '{col_name}'.")

    def print_template(self, format: str = "table"):
        """
        Prints the current metadata template.

        :param format: 'table' for a summarized DataFrame view,
                       'json' for the raw JSON-LD structure (case-insensitive).
        """
        fmt = format.lower()

        if fmt == "json":
            print(json.dumps(self.metadata_temp, indent=2))
            return

        if fmt != "table":
            print(f"Unknown format '{format}'. Use 'table' or 'json'.")
            return

        graph_data = self.metadata_temp.get("@graph", [])
        if not graph_data:
            print("The template graph is currently empty.")
            return

        df_summary = pd.DataFrame([self._summarize_entry(item) for item in graph_data])

        print("\n--- Metadata Template Summary ---")
        # Always a plain-text table: the frame is printed, not displayed.
        print(df_summary.to_string(index=False))
        print("------------------------------------------\n")