import os
import pandas as pd
from rdflib import Graph, RDFS
from rdflib.namespace import DCTERMS, DC, SKOS
from fuzzysearch import find_near_matches
import FAIRLinked.InterfaceMDS.load_mds_ontology
from FAIRLinked.InterfaceMDS.load_mds_ontology import load_mds_ontology_graph
def fuzzy_filter_subjects_strict(df, keywords, column="label", max_l_dist=1):
    """
    Perform strict fuzzy word-level matching using Levenshtein distance.

    Each value in ``column`` is converted to a lower-case string and split
    into words on whitespace, hyphens and underscores. A row is kept when at
    least one of its words yields a near match for at least one keyword
    according to ``find_near_matches``.

    Args:
        df (pd.DataFrame): DataFrame to filter.
        keywords (list of str): Keywords to search for (compared in lower
            case). Empty strings are ignored.
        column (str): Column to search.
        max_l_dist (int): Max Levenshtein distance.

    Returns:
        pd.DataFrame: The matching rows of ``df`` in their original order,
        with the original index labels, columns and dtypes. When nothing
        matches, the result is empty but still carries the columns of ``df``.
    """
    # Blank keywords carry no search intent, so drop them instead of handing
    # an empty pattern to the matcher.
    needles = [kw.lower() for kw in keywords if kw]

    # Nothing to search for, or nothing to search in: return an empty frame
    # that still has the input's columns so downstream code (e.g. to_csv)
    # sees a stable schema.
    if not needles or df.index.empty:
        return df.iloc[0:0].copy()

    def _label_matches(value):
        """Return True when any word of *value* is close to any keyword."""
        text = str(value).lower()
        words = set(text.replace("-", " ").replace("_", " ").split())
        return any(
            find_near_matches(needle, word, max_l_dist=max_l_dist)
            for word in words
            for needle in needles
        )

    # Select by position rather than by label so that duplicate index labels
    # cannot pull in rows that did not match.
    hit_positions = [
        position
        for position, value in enumerate(df[column])
        if _label_matches(value)
    ]
    return df.iloc[hit_positions]
# NOTE: This main() function is only for interactive CLI use
def main():
    """
    Example usage for local testing. Not included in the package version.

    Interactive flow:
      1. Prompt for the path of an RDF (.ttl) file and stop if it does not
         exist.
      2. Load the MDS ontology graph, extract the subject table and write it
         to ``<input path without extension>_subjects.csv``.
      3. Optionally prompt for comma-separated keywords and a maximum
         Levenshtein distance, then write the fuzzy-filtered rows to
         ``<input path without extension>_subjects_fuzzy_<keywords>.csv``.
    """
    file_path = input("Enter path to RDF (.ttl) file: ").strip()
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return

    # The graph is obtained from load_mds_ontology_graph(), which takes no
    # path here; file_path is used only for the existence check above and to
    # derive the output file names below.
    graph = load_mds_ontology_graph()
    # NOTE(review): extract_subject_details is not defined or imported in the
    # visible part of this file -- confirm it is in scope before running.
    df = extract_subject_details(graph)

    # Strip only the final extension. A plain str.replace(".ttl", ...) leaves
    # the path unchanged when the input is not a .ttl file (so the CSV would
    # overwrite the input) and also rewrites ".ttl" inside directory names.
    stem, _ = os.path.splitext(file_path)
    output_csv = f"{stem}_subjects.csv"
    df.to_csv(output_csv, index=False)
    print(f"\n📁 Full output saved to: {output_csv}")

    keywords_input = input("🔍 Enter fuzzy keywords (comma-separated, e.g., temp,pressure): ").strip()
    # Drop empty entries produced by trailing or doubled commas.
    keywords = [
        keyword
        for keyword in (part.strip() for part in keywords_input.split(","))
        if keyword
    ]
    if not keywords:
        return

    max_dist = input("Enter max Levenshtein distance (default = 1): ").strip()
    # isdecimal() accepts exactly the digit characters int() can parse;
    # isdigit() also accepts characters such as "²" that make int() raise.
    max_dist = int(max_dist) if max_dist.isdecimal() else 1

    filtered_df = fuzzy_filter_subjects_strict(df, keywords, max_l_dist=max_dist)
    fuzzy_out = f"{stem}_subjects_fuzzy_{'_'.join(keywords)}.csv"
    filtered_df.to_csv(fuzzy_out, index=False)
    print(f"✅ Fuzzy match output saved to: {fuzzy_out}")
# Script entry point: start the interactive session only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()