# Source code for FAIRLinked.QBWorkflow.rdf_data_cube_workflow

from FAIRLinked.QBWorkflow.input_handler import (
    check_if_running_experiment,
    get_domain,
    get_orcid,
    get_ontology_file,
    get_input_namespace_excel,
    get_input_data_excel,
    get_output_folder_path,
    get_dataset_name,
    get_namespace_for_dataset,
    has_all_ontology_files,
    has_existing_datacube_file,
    should_save_csv,
    choose_conversion_mode,
    check_ingestion,
    get_identifiers

)
import os
from FAIRLinked.QBWorkflow.mds_ontology_analyzer import get_classification
from FAIRLinked.QBWorkflow.namespace_template_generator import generate_namespace_excel
from FAIRLinked.QBWorkflow.data_template_generator import generate_data_xlsx_template
from FAIRLinked.QBWorkflow.namespace_parser import parse_excel_to_namespace_map
from FAIRLinked.QBWorkflow.data_parser import read_excel_template
from FAIRLinked.QBWorkflow.rdf_transformer import convert_dataset_to_rdf_with_mode
from FAIRLinked.QBWorkflow.rdf_to_df import parse_rdf_to_df
import traceback
from pprint import pprint

def rdf_data_cube_workflow_start():
    """
    Entry point of the FAIRLinked RDF Data Cube workflow.

    Dispatches to exactly one sub-workflow, decided by interactive prompts
    asked in this order:

    1. An existing RDF data cube file/folder is available
       -> ``parse_existing_datacube_workflow`` (back to tabular format).
    2. The user is running an experiment
       -> ``run_experiment_workflow`` (template generation).
    3. The user selected ingestion
       -> ``run_ingestion_workflow``.
    4. Otherwise
       -> ``run_standard_workflow`` (Excel -> RDF conversion).

    Any exception that reaches this function is reported on stdout and
    swallowed; the exit banner is printed in every case.
    """
    print("Welcome to FAIRLinked RDF Data Cube 🚀")
    try:
        # The helper answers both "is there a data cube?" and "where is it?".
        datacube_found, datacube_path = has_existing_datacube_file()

        # Later prompts are only asked when the earlier answers were negative,
        # so the elif chain keeps the original question order.
        if datacube_found:
            parse_existing_datacube_workflow(datacube_path)
        elif check_if_running_experiment():
            run_experiment_workflow()
        elif check_ingestion():
            run_ingestion_workflow()
        else:
            run_standard_workflow()
    except Exception as err:
        print(f"An error occurred in the main workflow: {err}")
        # Swap in traceback.print_exc() here when the full stack trace is needed.
    finally:
        print("FAIRLinked exiting")
def parse_existing_datacube_workflow(file_path: str):
    """
    Convert existing RDF data cube file(s) back into a table.

    The user is asked for an output folder. The parser is then told to write
    ``variable_metadata.json`` and ``dataset.parquet`` into that folder, and
    the user may additionally request an ``output.csv`` export.

    Args:
        file_path (str): Either a path to a single .ttl/.jsonld file or a
            directory containing multiple .ttl/.jsonld/.json-ld files.

    Errors are reported on stdout and not re-raised.
    """
    try:
        # Every artefact of this run lands in the folder chosen by the user.
        out_dir = get_output_folder_path()

        # RDF data cube(s) => tabular form; the metadata half of the result
        # is not used any further here.
        table, _metadata = parse_rdf_to_df(
            file_path=file_path,
            variable_metadata_json_path=os.path.join(out_dir, "variable_metadata.json"),
            arrow_output_path=os.path.join(out_dir, "dataset.parquet"),
        )
        print("Successfully parsed RDF data cube(s) to tabular format.")

        # The CSV export is opt-in; nothing more to do if it was declined.
        if not should_save_csv():
            return

        csv_target = os.path.join(out_dir, "output.csv")
        table.to_pandas().to_csv(csv_target, index=False)
        print(f"✅ DataFrame saved to {csv_target}")
    except Exception as err:
        print(f"An error occurred while parsing the existing data cube: {err}")
        # Swap in traceback.print_exc() here when the full stack trace is needed.
def run_experiment_workflow():
    """
    Produce the namespace and data Excel templates for a new experiment.

    Steps:
        1. Ask whether the user has the local ontology files (lowest-level
           and combined).
        2. If so, prompt for both files and classify the terms; terms that
           could not be mapped are printed as a warning.
        3. Write ``./namespace_template.xlsx`` and ``./data_template.xlsx``,
           the latter built from the mapped terms (an empty mapping when no
           ontology analysis was run).

    Errors are reported on stdout and not re-raised.
    """
    try:
        # Default used when the ontology analysis is skipped.
        term_map = {}

        if has_all_ontology_files():
            # Two separate prompts, one per ontology file.
            lowest_level_file = get_ontology_file("Lowest-level MDS ontology file")
            combined_file = get_ontology_file("Combined MDS ontology file")

            term_map, leftovers = get_classification(lowest_level_file, combined_file)
            if leftovers:
                print("\nWarning: The following terms could not be mapped to top-level categories:")
                pprint(leftovers, indent=2, width=80)
                print()
        else:
            print("\nGenerating default templates without ontology analysis...")

        # Both templates are written relative to the current working directory.
        generate_namespace_excel("./namespace_template.xlsx")
        generate_data_xlsx_template(term_map, "./data_template.xlsx")
    except Exception as err:
        print(f"An error occurred in the experiment workflow: {err}")
        # Swap in traceback.print_exc() here when the full stack trace is needed.
def run_standard_workflow():
    """
    Convert filled-in namespace and data Excel files into RDF output.

    Steps:
        1. Collect the ORCID, the namespace Excel path, the data Excel path
           and the output folder from the user.
        2. Ask for the conversion mode (entire or row-by-row).
        3. Ask for a dataset name only in "entire" mode; any other mode uses
           an empty name.
        4. Parse both Excel files and hand everything to
           ``convert_dataset_to_rdf_with_mode``.

    Errors are reported on stdout and not re-raised.
    """
    try:
        # Prompts are issued in a fixed order; do not reorder these calls.
        user_orcid = get_orcid()
        namespace_workbook = get_input_namespace_excel()
        data_workbook = get_input_data_excel()
        out_dir = get_output_folder_path()

        mode = choose_conversion_mode()
        # Only the "entire" mode needs a single dataset name.
        cube_name = get_dataset_name() if mode == "entire" else ""

        # Namespace Excel => namespace map.
        ns_map = parse_excel_to_namespace_map(namespace_workbook)

        # Data Excel => (variable metadata, DataFrame).
        var_meta, frame = read_excel_template(data_workbook)

        convert_dataset_to_rdf_with_mode(
            df=frame,
            variable_metadata=var_meta,
            namespace_map=ns_map,
            user_chosen_prefix='mds',
            output_folder_path=out_dir,
            orcid=user_orcid,
            dataset_name=cube_name,
            fixed_dimensions=None,
            conversion_mode=mode,
        )
    except Exception as err:
        print(f"An error occurred in the standard workflow: {err}")
        # Swap in traceback.print_exc() here when the full stack trace is needed.
def run_ingestion_workflow():
    """
    Convert namespace and data Excel files into RDF using the ingestion mode.

    Unlike ``run_standard_workflow``, the user is NOT asked for a conversion
    mode or a dataset name: the mode is fixed to ``"CRADLE"`` and the dataset
    name is left empty.

    Steps:
        1. Collect the ORCID, the namespace Excel path, the data Excel path
           and the output folder from the user.
        2. Parse the namespace Excel into a namespace map.
        3. Read the data Excel into variable metadata and a DataFrame.
        4. Call ``convert_dataset_to_rdf_with_mode`` with
           ``conversion_mode="CRADLE"`` and ``dataset_name=""``.

    Errors are reported on stdout (labelled as coming from the ingestion
    workflow) and not re-raised.
    """
    try:
        # Prompts are issued in a fixed order; do not reorder these calls.
        orcid = get_orcid()
        namespace_excel_path = get_input_namespace_excel()
        data_excel_path = get_input_data_excel()
        output_folder_path = get_output_folder_path()

        # Ingestion always uses this mode and no dataset name.
        conversion_mode = "CRADLE"
        dataset_name = ""

        # Parse user-provided Excel for namespace map
        namespace_map = parse_excel_to_namespace_map(namespace_excel_path)

        # Read data Excel => variable_metadata + DataFrame
        variable_metadata, df = read_excel_template(data_excel_path)

        # Perform the conversion
        convert_dataset_to_rdf_with_mode(
            df=df,
            variable_metadata=variable_metadata,
            namespace_map=namespace_map,
            user_chosen_prefix='mds',
            output_folder_path=output_folder_path,
            orcid=orcid,
            dataset_name=dataset_name,
            fixed_dimensions=None,
            conversion_mode=conversion_mode,
        )
    except Exception as e:
        # Previously this message named the standard workflow (copy-paste
        # slip), which pointed users at the wrong code path.
        print(f"An error occurred in the ingestion workflow: {e}")
        # Swap in traceback.print_exc() here when the full stack trace is needed.