Source code for FAIRLinked.QBWorkflow.rdf_data_cube_workflow

from FAIRLinked.QBWorkflow.input_handler import (
    check_if_running_experiment,
    get_domain,
    get_orcid,
    get_ontology_file,
    get_input_namespace_excel,
    get_input_data_excel,
    get_output_folder_path,
    get_dataset_name,
    get_namespace_for_dataset,
    has_all_ontology_files,
    has_existing_datacube_file,
    should_save_csv,
    choose_conversion_mode,
    check_ingestion,
    get_identifiers

)
import os
from FAIRLinked.QBWorkflow.mds_ontology_analyzer import get_classification
from FAIRLinked.QBWorkflow.namespace_template_generator import generate_namespace_excel
from FAIRLinked.QBWorkflow.data_template_generator import generate_data_xlsx_template
from FAIRLinked.QBWorkflow.namespace_parser import parse_excel_to_namespace_map
from FAIRLinked.QBWorkflow.data_parser import read_excel_template
from FAIRLinked.QBWorkflow.rdf_transformer import convert_dataset_to_rdf_with_mode
from FAIRLinked.QBWorkflow.rdf_to_df import parse_rdf_to_df
import traceback
from pprint import pprint


[docs]
def rdf_data_cube_workflow_start():
    """
    Entry point for the interactive FAIRLinked RDF Data Cube workflow.

    Prints a welcome banner, then dispatches to exactly one sub-workflow
    based on the user's answers:

    1. An existing RDF data cube file/folder is reported
       => parse_existing_datacube_workflow(path).
    2. Otherwise, the user is running an experiment
       => run_experiment_workflow().
    3. Otherwise, the user selects ingestion
       => run_ingestion_workflow().
    4. Otherwise
       => run_standard_workflow().

    Later questions are only asked when the earlier ones did not select a
    workflow. Any exception that reaches this level is reported on stdout
    instead of propagating, and the exit message is always printed.
    """
    print("Welcome to FAIRLinked RDF Data Cube 🚀")

    try:
        cube_found, cube_location = has_existing_datacube_file()

        # The chain mirrors the order of the prompts: each question is asked
        # only if every previous branch was declined.
        if cube_found:
            parse_existing_datacube_workflow(cube_location)
        elif check_if_running_experiment():
            run_experiment_workflow()
        elif check_ingestion():
            run_ingestion_workflow()
        else:
            run_standard_workflow()

    except Exception as err:
        print(f"An error occurred in the main workflow: {err}")
        # For a full stack trace while debugging, call traceback.print_exc() here.
    finally:
        print("FAIRLinked exiting")




[docs]
def parse_existing_datacube_workflow(file_path: str):
    """
    Convert an existing RDF data cube (single file or directory) to a table.

    The user is asked for an output folder. Two paths inside that folder,
    "variable_metadata.json" and "dataset.parquet", are handed to
    parse_rdf_to_df together with ``file_path``. If the user then opts in,
    the resulting table is also exported as "output.csv" in the same folder.

    Args:
        file_path (str): Path to a single .ttl/.jsonld file, or to a
                         directory containing .ttl/.jsonld/.json-ld files.

    Any exception raised along the way is reported on stdout and swallowed.
    """
    try:
        out_dir = get_output_folder_path()

        # Only the table is needed here; the second return value is unused.
        table, _metadata = parse_rdf_to_df(
            file_path=file_path,
            variable_metadata_json_path=os.path.join(out_dir, "variable_metadata.json"),
            arrow_output_path=os.path.join(out_dir, "dataset.parquet"),
        )
        print("Successfully parsed RDF data cube(s) to tabular format.")

        # CSV export is opt-in; nothing more to do if the user declines.
        if not should_save_csv():
            return

        csv_path = os.path.join(out_dir, "output.csv")
        frame = table.to_pandas()
        frame.to_csv(csv_path, index=False)
        print(f"✅ DataFrame saved to {csv_path}")

    except Exception as err:
        print(f"An error occurred while parsing the existing data cube: {err}")
        # For a full stack trace while debugging, call traceback.print_exc() here.



[docs]
def run_experiment_workflow():
    """
    Generate the namespace and data Excel templates for a new experiment.

    Steps:
    1. Ask whether the user has all required ontology files.
    2. If so, prompt for the lowest-level and combined MDS ontology files,
       classify their terms with get_classification, and print a warning
       listing any terms that could not be mapped.
    3. If not, continue with an empty term mapping.
    4. Write './namespace_template.xlsx' and './data_template.xlsx'
       (the latter receives the term mapping).

    Any exception raised along the way is reported on stdout and swallowed.
    """
    try:
        if not has_all_ontology_files():
            print("\nGenerating default templates without ontology analysis...")
            terms_by_category = {}
        else:
            # Two separate prompts, one per ontology file.
            lowest_level_file = get_ontology_file("Lowest-level MDS ontology file")
            combined_file = get_ontology_file("Combined MDS ontology file")
            terms_by_category, leftovers = get_classification(lowest_level_file, combined_file)

            if leftovers:
                print("\nWarning: The following terms could not be mapped to top-level categories:")
                pprint(leftovers, indent=2, width=80)
                print()

        # Both templates are written relative to the current working directory.
        generate_namespace_excel("./namespace_template.xlsx")
        generate_data_xlsx_template(terms_by_category, "./data_template.xlsx")

    except Exception as err:
        print(f"An error occurred in the experiment workflow: {err}")
        # For a full stack trace while debugging, call traceback.print_exc() here.



[docs]
def run_standard_workflow():
    """
    Convert user-supplied namespace and data Excel files into RDF output.

    Steps:
    1. Prompt for ORCID, namespace Excel, data Excel and output folder.
    2. Prompt for the conversion mode.
    3. Prompt for a dataset name only when the mode is "entire"; any other
       mode uses an empty dataset name.
    4. Parse both Excel files and pass everything to
       convert_dataset_to_rdf_with_mode with the 'mds' prefix and no
       fixed dimensions.

    Any exception raised along the way is reported on stdout and swallowed.
    """
    try:
        # Interactive inputs, collected in the order the user is prompted.
        researcher_orcid = get_orcid()
        namespace_workbook = get_input_namespace_excel()
        data_workbook = get_input_data_excel()
        destination_dir = get_output_folder_path()
        mode = choose_conversion_mode()

        # Only the "entire" mode needs a single dataset name.
        cube_name = get_dataset_name() if mode == "entire" else ""

        # Namespace workbook => namespace map; data workbook => metadata + DataFrame.
        prefixes = parse_excel_to_namespace_map(namespace_workbook)
        column_metadata, frame = read_excel_template(data_workbook)

        convert_dataset_to_rdf_with_mode(
            df=frame,
            variable_metadata=column_metadata,
            namespace_map=prefixes,
            user_chosen_prefix='mds',
            output_folder_path=destination_dir,
            orcid=researcher_orcid,
            dataset_name=cube_name,
            fixed_dimensions=None,
            conversion_mode=mode,
        )

    except Exception as err:
        print(f"An error occurred in the standard workflow: {err}")
        # For a full stack trace while debugging, call traceback.print_exc() here.


[docs]
def run_ingestion_workflow():
    """
    Convert namespace and data Excel files into RDF output in ingestion mode.

    Unlike run_standard_workflow, the user is not asked for a conversion
    mode or a dataset name: the mode is fixed to "CRADLE" and the dataset
    name is left empty.

    Steps:
    1. Prompt for ORCID, namespace Excel, data Excel and output folder.
    2. Parse the namespace Excel into a namespace map.
    3. Read the data Excel into variable metadata and a DataFrame.
    4. Call convert_dataset_to_rdf_with_mode with the 'mds' prefix, no
       fixed dimensions, conversion_mode="CRADLE" and dataset_name="".

    Any exception raised along the way is reported on stdout and swallowed.
    """
    try:
        orcid = get_orcid()
        namespace_excel_path = get_input_namespace_excel()
        data_excel_path = get_input_data_excel()
        output_folder_path = get_output_folder_path()

        # Ingestion always uses this mode, so no prompt is shown.
        # NOTE(review): how "CRADLE" is interpreted is decided inside
        # convert_dataset_to_rdf_with_mode - confirm there.
        conversion_mode = "CRADLE"

        # No dataset name is collected for ingestion.
        dataset_name = ""

        # Parse user-provided Excel for namespace map
        namespace_map = parse_excel_to_namespace_map(namespace_excel_path)

        # Read data Excel => variable_metadata + DataFrame
        variable_metadata, df = read_excel_template(data_excel_path)

        # Perform the conversion
        convert_dataset_to_rdf_with_mode(
            df=df,
            variable_metadata=variable_metadata,
            namespace_map=namespace_map,
            user_chosen_prefix='mds',
            output_folder_path=output_folder_path,
            orcid=orcid,
            dataset_name=dataset_name,
            fixed_dimensions=None,
            conversion_mode=conversion_mode
        )

    except Exception as e:
        # Name the ingestion workflow so failures are not mistaken for
        # failures of the standard workflow.
        print(f"An error occurred in the ingestion workflow: {e}")
        # For a full stack trace while debugging, call traceback.print_exc() here.