Source code for CanGraph.ExposomeExplorer.build_database

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT

"""
A Python module that provides the functions necessary to transition the Exposome Explorer database to graph format,
either from scratch, importing all the nodes (as showcased in :obj:`CanGraph.ExposomeExplorer.main`), or on a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
"""

# Import external modules necessary for the script
import os, sys, shutil               # Vital modules to interact with the filesystem

# Import subscripts for the program
# This hack allows us to avoid duplicating the "miscelaneous" script just for this less-used script
sys.path.append("../")
# .. NOTE:: Please be aware that, if using this module by itself, you might need to copy "miscelaneous.py" into your path
# This is not the most elegant solution, but it simplifies code maintenance, and this script shouldn't be used much, so...
import miscelaneous as misc

# ********* First, we add some general functions to start the discovery or do it automatically ********* #

def import_csv(filename, label):
    """
    Build the Cypher query that imports a given CSV into Neo4J through APOC.

    This CSV **must** be present in Neo4J's Import Path.

    Args:
        filename (str): The name of the CSV file that is being imported
        label (str): The label of the Neo4J nodes that will be imported,
            with the columns of the CSV being their properties.

    Returns:
        str: The text of a Cypher query calling ``apoc.import.csv``; running it is left to the caller.

    .. NOTE:: For this to work, you HAVE TO have APOC available on your Neo4J installation
    """
    # Describe the single file to load and the (capitalized) label its nodes receive
    node_source = f"{{fileName: 'file:/{filename}', labels: [apoc.text.capitalize('{label}')]}}"
    # The two remaining arguments are passed empty: a list and a map
    return f" CALL apoc.import.csv([{node_source}], [], {{}}) "
def add_components(filename):
    """
    Build the Cypher query that adds "Metabolite" nodes from Exposome-Explorer's components.csv

    These components are, in fact, metabolites, either from food or from human metabolism.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query that MERGEs one "Metabolite" node per CSV row
        (keyed by ``"Component_" + id``) and sets its properties from the row's columns.
    """
    # (node property, Cypher expression providing its value), in the order they are SET
    assignments = (
        ("Name", "line.name"),
        ("Description", "line.description"),
        ("Alternative_Names", 'replace(line.alternative_names, ";", ",")'),
        ("Level", "line.level"),
        ("CAS_Number", "line.cas_number"),
        ("PubChem_ID", "line.pubchem_compound_id"),
        ("ChEBI_ID", "toInteger(line.chebi_id)"),
        ("FooDB_Compound_ID", "line.foodb_compound_id"),
        ("HMDB_ID", "line.hmdb_id"),
        ("FooDB_Food_ID", "line.foodb_food_id"),
        ("SMILES", "line.moldb_smiles"),
        ("Formula", "line.moldb_formula"),
        ("InChI", "line.moldb_inchi"),
        ("InChIKey", "line.moldb_inchikey"),
        ("Average_Mass", "line.moldb_average_mass"),
        # NOTE(review): "Monisotopic" looks like a misspelling; kept because it is the stored property name
        ("Monisotopic_Molecular_Weight", "line.moldb_mono_mass"),
        ("Displayed_Excretion_Concentration_Count", "line.displayed_excretion_concentration_count"),
        ("Displayed_Correlated_Biomarker_Count", "line.displayed_correlated_biomarker_count"),
        ("Displayed_Metabolomic_Associated_Biomarker_Count",
         "line.displayed_metabolomic_associated_biomarker_count"),
        ("Displayed_Associated_Biomarker_Count", "line.displayed_associated_biomarker_count"),
        ("Displayed_Reproducibility_Count", "line.displayed_reproducibility_count"),
        ("Displayed_Cancer_Association_Count", "line.displayed_cancer_association_count"),
        ("Displayed_Intake_Value_Count", "line.displayed_intake_value_count"),
        ("Displayed_Intakes_Correlated_with_Excretion_Count",
         "line.displayed_intakes_correlated_with_excretion_count"),
        ("Displayed_Excretions_Correlated_with_Intake_Count",
         "line.displayed_excretions_correlated_with_intake_count"),
        ("Displayed_Excretions_Associated_with_Intake_Count",
         "line.displayed_excretions_associated_with_intake_count"),
        ("Displayed_Intakes_Associated_with_Excretion_Count",
         "line.displayed_intakes_associated_with_excretion_count"),
        ("Displayed_Publication_Count", "line.displayed_publication_count"),
        ("Displayed_Microbial_Metabolite_Identification_count",
         "line.displayed_microbial_metabolite_identification_count"),
        ("Displayed_Proof_2_Publications_Count", "line.displayed_proof_2_publications_count"),
        ("Displayed_Proof_3_Publications_Count", "line.displayed_proof_3_publications_count"),
        ("Displayed_Proof_4_Publications_Count", "line.displayed_proof_4_publications_count"),
        ("Displayed_Nb_of_proofs", "line.displayed_nb_of_proofs"),
    )
    set_clause = ", ".join(f"c.{prop} = {value}" for prop, value in assignments)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MERGE (c:Metabolite { Exposome_Explorer_ID:"Component_"+line["id"] })'
        f" SET {set_clause} "
    )
# ********* Now, we build the "scaffolding" - the raw nodes which we will then annotate ********* #
def add_measurements_stuff(filename):
    """
    Build the (massive and slow-running) Cypher query that creates ALL the relations between
    the 'measurements' table and the tables related to it:

    - units: The units in which a given measurement is expressed
    - components: The component which is being measured
    - samples: The sample from which a measurement is taken
    - experimental_methods: The method used to take a measurement

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Rows are only processed when their component already
        exists as a "Metabolite" node (it is MATCHed, not MERGEd); the Measurement, Sample,
        ExperimentalMethod and Unit nodes are created on demand.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        # The metabolite must already be present: it anchors everything else
        'MATCH (c:Metabolite { Exposome_Explorer_ID: "Component_"+line.component_id })',
        'MERGE (m:Measurement { Exposome_Explorer_ID: "Measurement_"+line.id })',
        'MERGE (s:Sample { Exposome_Explorer_ID: "Sample_"+line.sample_id })',
        'MERGE (me:ExperimentalMethod { Exposome_Explorer_ID: "ExperimentalMethod_"+line.experimental_method_id })',
        'MERGE (u:Unit { Exposome_Explorer_ID: "Unit_"+line.unit_id })',
        "MERGE (c)-[r1:MEASURED_AS]->(m)",
        "MERGE (m)-[r2:TAKEN_FROM_SAMPLE]->(s)",
        "MERGE (m)-[r3:USING_METHOD]->(me)",
        "MERGE (m)-[r4:MEASURED_IN]->(u)",
    )
    return " " + " ".join(clauses) + " "
def add_reproducibilities(filename):
    """
    Build the Cypher query that creates relations between the "reproducibilities" and the
    "measurements" table, using "initial_id", an old identifier, for the linkage.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. For each CSV row whose ``initial_id`` designates an
        existing "Measurement" node, a "Reproducibility" node (keyed by the row's own ``id``)
        is MERGEd and linked to that measurement.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        # The measurement is looked up through initial_id, as documented above. Matching on
        # line.id would pair each reproducibility with whichever measurement happens to
        # share its numeric id.
        # NOTE(review): confirm against the CSV that initial_id holds measurement ids.
        'MATCH (m:Measurement { Exposome_Explorer_ID: "Measurement_"+line.initial_id })',
        'MERGE (re:Reproducibility { Exposome_Explorer_ID: "Reproducibility_"+line.id })',
        # The relationship type's spelling is left untouched: it is part of the stored schema
        "MERGE (m)-[r:REPODUCIBILE_WITH_CONDITIONS]->(re)",
    )
    return " " + " ".join(clauses) + " "
def add_samples(filename):
    """
    Build the Cypher query that imports the relations pertaining to the "samples" table.

    A sample is taken from a given subject and a given tissue (that is, a specimen,
    which will be blood, urine, etc).

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only rows whose "Sample" node already exists are
        processed; the related "BioSpecimen" and "Subject" nodes are created on demand.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        'MATCH (s:Sample { Exposome_Explorer_ID: "Sample_"+line.id })',
        'MERGE (sp:BioSpecimen { Exposome_Explorer_ID: "Specimen_"+line.specimen_id })',
        'MERGE (sb:Subject { Exposome_Explorer_ID: "Subject_"+line.subject_id })',
        "MERGE (s)-[r1:FOUND_IN]->(sp)",
        "MERGE (s)-[r2:TAKEN_FROM_SUBJECT]->(sb)",
    )
    return " " + " ".join(clauses) + " "
def add_subjects(filename):
    """
    Build the Cypher query that imports the relations pertaining to the "subjects" table.

    Basically, a subject can appear in a given publication, and will be part of a cohort
    (i.e. a group of subjects).

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only rows whose "Subject" node already exists are
        processed; the "Publication" node is always MERGEd, while the "Cohort" node and its
        relation are only MERGEd when the row has a ``cohort_id``.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        'MATCH (s:Subject { Exposome_Explorer_ID: "Subject_"+line.id })',
        'MERGE (p:Publication { Exposome_Explorer_ID: "Publication_"+line.publication_id })',
        "MERGE (s)-[r2:CITED_IN]->(p)",
        # FOREACH over a one-or-zero element list: a conditional MERGE for optional cohorts
        "FOREACH(ignoreMe IN CASE WHEN line.cohort_id IS NOT null THEN [1] ELSE [] END |",
        'MERGE (c:Cohort { Exposome_Explorer_ID: "Cohort_"+line.cohort_id })',
        "MERGE (s)-[r1:PART_OF_COHORT]->(c)",
        ")",
    )
    return " " + " ".join(clauses) + " "
def add_microbial_metabolite_identifications(filename):
    """
    Build the Cypher query that imports the relations pertaining to the
    "microbial_metabolite_identifications" table.

    A component (i.e. a metabolite) can be identified as a Microbial Metabolite, which means
    it has an equivalent in the microbiome. This can have a given reference and a tissue
    (BioSpecimen) in which it occurs.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only rows whose "Metabolite" node already exists are
        processed; the "Publication" node is always MERGEd, while the "BioSpecimen" node and
        its relation are only MERGEd when the row has a ``specimen_id``.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        'MATCH (m:Metabolite { Exposome_Explorer_ID: "Component_"+line.component_id })',
        # Property name kept exactly as stored in the database (including its spelling)
        "SET m.MicobialMetabolite_ID = line.id",
        'MERGE (p:Publication { Exposome_Explorer_ID: "Publication_"+line.publication_id })',
        "MERGE (m)-[r1:CITED_IN]->(p)",
        # FOREACH over a one-or-zero element list: a conditional MERGE for optional specimens
        "FOREACH(ignoreMe IN CASE WHEN line.specimen_id IS NOT null THEN [1] ELSE [] END |",
        'MERGE (s:BioSpecimen { Exposome_Explorer_ID: "Specimen_"+line.specimen_id })',
        "MERGE (m)-[r3:LOCATED_IN_BIOSPECIMEN]->(s)",
        ")",
    )
    return " " + " ".join(clauses) + " "
def add_cancer_associations(filename):
    """
    Build the Cypher query that imports the 'cancer_associations' database as a relation
    between a given Cancer and a Measurement.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only rows whose ``excretion_id`` designates an existing
        "Measurement" node are processed; the "Disease" node is created on demand, and the
        association's own identifier is stored on the relationship.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        'MATCH (m:Measurement { Exposome_Explorer_ID: "Measurement_"+line.`excretion_id` })',
        'MERGE (c:Disease { Exposome_Explorer_ID: "Cancer_"+line.`cancer_id` })',
        # The relationship is merged without a direction, exactly as in the stored query
        "MERGE (m)-[r:ASSOCIATED_DISEASE_MEASUREMENT]-(c)",
        'SET r.Exposome_Explorer_ID = "CancerAssociation_"+line.id',
    )
    return " " + " ".join(clauses) + " "
def add_metabolomic_associations(filename):
    """
    Build the Cypher query that imports the 'metabolomic_associations' database as a relation
    between two measurements: the intake_id, a food taken by the organism and registered using
    dietary questionnaires, and the excretion_id, a chemical found in human biological samples,
    such that, when one takes one component, one will excrete the other.

    Data comes from Metabolomics studies seeking to identify putative dietary biomarkers.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Both "Measurement" nodes must already exist; a directed
        relationship from the intake to the excretion is MERGEd and annotated from the CSV row.
    """
    # (relationship property, Cypher expression providing its value), in the order they are SET
    assignments = (
        ("Exposome_Explorer_ID", '"MetabolomicAssociation_"+line.id'),
        ("Feature_Selection", "line.feature_selection"),
        ("Author_Structural_Identification_Level", "line.author_structural_identification_level"),
        ("Area_Under_Curve_Prefixe", "line.area_under_curve_prefixe"),
        ("Area_Under_Curve", "line.area_under_curve"),
        ("sensitivity_prefixe", "line.sensitivity_prefixe"),
        ("Sensitivity", "line.sensitivity"),
        ("specificity_prefixe", "line.specificity_prefixe"),
        ("Specificity", "line.specificity"),
        ("plsda_vip", "line.plsda_vip"),
        ("Beta_Coefficient", "line.beta_coefficient"),
        ("Beta_Coefficient_p_value", "line.beta_coefficient_p_value"),
        ("anova_p_value", "line.anova_p_value"),
    )
    set_clause = ", ".join(f"r.{prop} = {value}" for prop, value in assignments)
    return (
        f" LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line"
        ' MATCH (m:Measurement { Exposome_Explorer_ID: "Measurement_"+line.`intake_id` })'
        ' MATCH (n:Measurement { Exposome_Explorer_ID: "Measurement_"+line.`excretion_id` })'
        " MERGE (m)-[r:ASSOCIATED_WITH_MEASUREMENT]->(n)"
        f" SET {set_clause} "
    )
def add_correlations(filename):
    """
    Build the Cypher query that imports the 'correlations' database as a relation between two
    measurements: the intake_id, a food taken by the organism and registered using dietary
    questionnaires, and the excretion_id, a chemical found in human biological samples, such
    that, when one takes one component, one will excrete the other.

    Data comes from epidemiological studies where dietary questionnaires are administered,
    and biomarkers are measured in specimens.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Both "Measurement" nodes must already exist; the
        relationship between them is MERGEd without a direction and annotated from the CSV row.
    """
    # (relationship property, Cypher expression providing its value), in the order they are SET
    assignments = (
        ("Exposome_Explorer_ID", '"Correlation_"+line.id'),
        ("Coefficient_Type", "line.coefficient_type"),
        ("Coefficient_Value", "line.coefficient_value"),
        ("p_value", "line.p_value"),
        ("p_value_prefixe", "line.p_value_prefixe"),
        ("Confidence_Interval_95_Lower", "line.confidence_interval_95_lower"),
        ("Confidence_Interval_95_Upper", "line.confidence_interval_95_upper"),
        ("Is_Significant", "line.is_significant"),
        ("covariates", "line.covariates"),
        ("Intake_ID", "line.intake_id"),
        ("Excretion_ID", "line.excretion_id"),
        ("Measurement_Adjustment", "line.measurement_adjustment"),
        ("Deattenuation", "line.deattenuation"),
        ("size", "line.size"),
    )
    set_clause = ", ".join(f"r.{prop} = {value}" for prop, value in assignments)
    return (
        f" LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line"
        ' MATCH (m:Measurement { Exposome_Explorer_ID: "Measurement_"+line.`intake_id` })'
        ' MATCH (n:Measurement { Exposome_Explorer_ID: "Measurement_"+line.`excretion_id` })'
        " MERGE (m)-[r:ASSOCIATED_WITH_MEASUREMENT]-(n)"
        f" SET {set_clause} "
    )
# ********* Finally, we can annotate the nodes created ********* #
def annotate_measurements(filename):
    """
    Build the Cypher query that annotates "Measurement" nodes from Exposome-Explorer's measurements.csv

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Measurement" nodes that already exist are
        annotated (they are MATCHed, not created), each property being copied from the
        CSV column of the same, lowercase, name.
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Concentration_Mean", "concentration_mean"),
        ("Concentration_Median", "concentration_median"),
        ("Concentration_Min", "concentration_min"),
        ("Concentration_Max", "concentration_max"),
        ("Concentration_Percentile_05", "concentration_percentile_05"),
        ("Concentration_Percentile_10", "concentration_percentile_10"),
        ("Concentration_Percentile_25", "concentration_percentile_25"),
        ("Concentration_Percentile_75", "concentration_percentile_75"),
        ("Concentration_Percentile_90", "concentration_percentile_90"),
        ("Concentration_Percentile_95", "concentration_percentile_95"),
        ("Concentration_InterQuartile_Range", "concentration_interquartile_range"),
        ("Confidence_Interval_95_Lower", "confidence_interval_95_lower"),
        ("Confidence_Interval_95_Upper", "confidence_interval_95_upper"),
        # Column names are case-sensitive and every other column of this file is referenced
        # in lowercase; the capitalized "line.Concentration_SD" always evaluated to null.
        ("Concentration_SD", "concentration_sd"),
        ("Size", "size"),
        ("Component_ID", "component_id"),
        ("Sample_ID", "sample_id"),
        ("Experimental_Method_ID", "experimental_method_id"),
        ("Ancestry", "ancestry"),
        ("Regressed_On", "regressed_on"),
        ("Unit_ID", "unit_id"),
        ("Adjustment_Type", "adjustment_type"),
        ("Adjusted_On", "adjusted_on"),
        ("Expressed_as_ID", "expressed_as_id"),
        ("Supplement_Inclusion", "supplement_inclusion"),
        ("Detected_Proportion", "detected_proportion"),
        ("Detected_Size", "detected_size"),
        ("Food_Items", "food_items"),
        ("Concentration_GeoMean", "concentration_geomean"),
        ("Concentration_GeoSD", "concentration_geosd"),
        ("Concentration_Detected_Min", "concentration_detected_min"),
        ("Detected_Only", "detected_only"),
        ("Confidence_Interval_95_Geo_Lower", "confidence_interval_95_geo_lower"),
        ("Confidence_Interval_95_Geo_Upper", "confidence_interval_95_geo_upper"),
    )
    set_clause = ", ".join(f"m.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (m:Measurement { Exposome_Explorer_ID:"Measurement_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_samples(filename):
    """
    Build the Cypher query that annotates "Sample" nodes from Exposome-Explorer's samples.csv

    From a Sample, one can take a series of measurements.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Sample" nodes that already exist are annotated
        (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Subject_ID", "subject_id"),
        ("Ancestry", "ancestry"),
        ("Repetitions", "repetitions"),
        ("Time", "time"),
        ("Specimen_ID", "specimen_id"),
        ("Time_Definition", "time_definition"),
        ("Intake_Tool", "intake_tool"),
        ("Intake_Food_Coverage", "intake_food_coverage"),
        ("Intake_Time_Coverage", "intake_time_coverage"),
        ("Intervention_Dose", "intervention_dose"),
    )
    set_clause = ", ".join(f"s.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (s:Sample { Exposome_Explorer_ID:"Sample_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_experimental_methods(filename):
    """
    Build the Cypher query that annotates "ExperimentalMethod" nodes from Exposome-Explorer's
    experimental_methods.csv

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "ExperimentalMethod" nodes that already exist are
        annotated (they are MATCHed, not created); in the alternative names, ";" separators
        are replaced by ",".
    """
    # (node property, Cypher expression providing its value), in the order they are SET
    assignments = (
        ("Name", "line.name"),
        ("Method_Type", "line.method_type"),
        ("Alternative_Names", 'replace(line.alternative_names, ";", ",")'),
        ("Displayed_Excretion_Concentration_Count", "line.displayed_excretion_concentration_count"),
        ("Displayed_Biomarker_Count", "line.displayed_biomarker_count"),
        ("Displayed_Publication_Count", "line.displayed_publication_count"),
        ("Displayed_Reproducibility_Count", "line.displayed_reproducibility_count"),
        ("Displayed_Excretions_Correlated_with_Intake_Count",
         "line.displayed_excretions_correlated_with_intake_count"),
        ("Displayed_Cancer_Association_Count", "line.displayed_cancer_association_count"),
    )
    set_clause = ", ".join(f"em.{prop} = {value}" for prop, value in assignments)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (em:ExperimentalMethod { Exposome_Explorer_ID:"ExperimentalMethod_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_units(filename):
    """
    Build the Cypher query that annotates "Unit" nodes from Exposome-Explorer's units.csv

    A unit can be converted into another (for example, for normalization).

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Unit" nodes that already exist are annotated
        (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Name", "name"),
        ("Type", "unit_type"),
        ("Group", "unit_group"),
        ("Converted_to_ID", "converted_to_id"),
    )
    set_clause = ", ".join(f"u.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (u:Unit { Exposome_Explorer_ID:"Unit_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_auto_units(filename):
    """
    Build the Cypher query that links each "Unit" node to the unit it is converted into.

    Shows the correlations between two units, converted using the rubygem
    'https://github.com/masa16/phys-units', which standardizes units of measurement for our data.

    Args:
        filename (str): The name of the CSV file that is being imported. It must provide
            the ``id`` and ``converted_to_id`` columns.

    Returns:
        str: The text of a Cypher query. Both "Unit" nodes must already exist; a
        ``CONVERTED_INTO`` relationship is MERGEd from the row's unit to its target unit.
    """
    clauses = (
        f"LOAD CSV WITH HEADERS FROM 'file:///{filename}' AS line",
        'MATCH (u1:Unit { Exposome_Explorer_ID: "Unit_"+line.id }),',
        # The target is the unit designated by converted_to_id. Matching u2 on line.id, as
        # this query used to, gave every unit a CONVERTED_INTO relationship to itself.
        '(u2:Unit { Exposome_Explorer_ID: "Unit_"+line.converted_to_id })',
        "MERGE (u1)-[r:CONVERTED_INTO]->(u2)",
    )
    return " " + " ".join(clauses) + " "
def annotate_cancers(filename):
    """
    Build the Cypher query that annotates cancer ("Disease") nodes from Exposome-Explorer's cancers.csv

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Disease" nodes that already exist are annotated
        (they are MATCHed, not created); in the alternative names, ";" separators are
        replaced by ",".
    """
    # (node property, Cypher expression providing its value), in the order they are SET
    assignments = (
        ("Name", "line.name"),
        ("Alternative_Names", 'replace(line.alternative_names, ";", ",")'),
        ("Displayed_Publication_Count", "line.displayed_publication_count"),
        ("Displayed_Cancer_Association_Count", "line.displayed_cancer_association_count"),
        ("Displayed_Biomarker_Count", "line.displayed_biomarker_count"),
    )
    set_clause = ", ".join(f"c.{prop} = {value}" for prop, value in assignments)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (c:Disease { Exposome_Explorer_ID:"Cancer_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_cohorts(filename):
    """
    Build the Cypher query that annotates "Cohort" nodes from Exposome-Explorer's cohorts.csv

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Cohort" nodes that already exist are annotated
        (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Name", "name"),
        ("Abbreviation", "abbreviation"),
        # Read from the CSV row: this used to be "c.description", i.e. a (non-existent)
        # property of the node itself, so the description was never imported.
        ("Description", "description"),
        ("Citation", "citation"),
        ("Displayed_Biomarker_Count", "displayed_biomarker_count"),
        ("Displayed_Excretion_Concentration_Count", "displayed_excretion_concentration_count"),
        ("Study_Design_Type", "study_design_type"),
        ("PubMed_ID", "pmid"),
        ("URL", "url"),
        ("Country", "country"),
        ("Displayed_Publication_Count", "displayed_publication_count"),
        ("Displayed_Intake_Value_Count", "displayed_intake_value_count"),
        ("Displayed_Correlation_Count", "displayed_correlation_count"),
        ("Displayed_Metabolomic_Association_Count", "displayed_metabolomic_association_count"),
        ("Displayed_Cancer_Association_Count", "displayed_cancer_association_count"),
    )
    set_clause = ", ".join(f"c.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (c:Cohort { Exposome_Explorer_ID:"Cohort_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_microbial_metabolite_info(filename):
    """
    Build the Cypher query that annotates "Metabolite" nodes from Exposome-Explorer's
    microbial_metabolite_identifications.csv

    These represent all metabolites that have been re-identified as present, for instance,
    in the microbiome.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Metabolite" nodes that already exist are
        annotated; they are looked up through the row's ``component_id``.
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Publication_ID", "publication_id"),
        ("Component_ID", "component_id"),
        ("Antibiotic", "antibiotic"),
        ("Identification_Method", "identified_by"),
        ("Specimen_ID", "specimen_id"),
        ("Bacterial_Source", "bacterial_source"),
        ("Substrate", "substrate"),
        ("Organism", "organism"),
    )
    set_clause = ", ".join(f"mm.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (mm:Metabolite { Exposome_Explorer_ID:"Component_"+line["component_id"] })'
        f" SET {set_clause} "
    )
def annotate_publications(filename):
    """
    Build the Cypher query that annotates "Publication" nodes from Exposome-Explorer's publications.csv

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Publication" nodes that already exist are
        annotated (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Title", "title"),
        ("First_Author", "author_first"),
        ("Date", "year"),
        ("Publication", "journal"),
        ("Volume", "volume"),
        ("Issue", "issue"),
        # Column names are case-sensitive and every other column of this file is referenced
        # in lowercase; the capitalized "line.Pages" always evaluated to null.
        ("Pages", "pages"),
        ("PubMed_ID", "pmid"),
        ("Authors", "authors"),
        ("DOI", "doi"),
        ("Public", "public"),
        ("Metabolomics", "metabolomics"),
        ("Intake_Count", "intake_count"),
        ("Intake_Value_Count", "intake_value_count"),
        ("Excretion_Count", "excretion_count"),
        ("Excretion_Value_Count", "excretion_value_count"),
        ("Correlation_Value_Count", "correlation_value_count"),
        ("Reproducibility_Value_Count", "reproducibility_value_count"),
        ("Metabolomic_Association_Count", "metabolomic_association_count"),
        ("Study_Design_Type", "study_design_type"),
        ("Full_Annotation", "full_annotation"),
        ("Cancer_Association_Count", "cancer_association_count"),
        ("Displayed_Biomarker_Count", "displayed_biomarker_count"),
        ("Microbial_Metabolite_Identification_Count", "microbial_metabolite_identification_count"),
    )
    set_clause = ", ".join(f"p.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (p:Publication { Exposome_Explorer_ID:"Publication_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_reproducibilities(filename):
    """
    Build the Cypher query that annotates "Reproducibility" nodes from Exposome-Explorer's
    reproducibilities.csv

    These represent the conditions under which a given study/measurement was carried out.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Reproducibility" nodes that already exist are
        annotated (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Initial_ID", "initial_id"),
        ("ICC", "icc"),
        ("ICC_Confidence_Interval_95_Lower", "icc_confidence_interval_95_lower"),
        # Column names are case-sensitive and the matching "lower" bound is read from a
        # lowercase column; the capitalized column name always evaluated to null.
        ("ICC_Confidence_Interval_95_Upper", "icc_confidence_interval_95_upper"),
        ("CV_Within", "cv_within"),
        ("CV_Between", "cv_between"),
        ("Variance_Within", "variance_within"),
        ("Size", "size"),
    )
    set_clause = ", ".join(f"r.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (r:Reproducibility { Exposome_Explorer_ID:"Reproducibility_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_specimens(filename):
    """
    Build the Cypher query that annotates "BioSpecimen" nodes from Exposome-Explorer's
    specimens.csv whose ID is already present on the DB.

    A biospecimen is a type of tissue where a measurement can originate, such as urine,
    CSF fluid, etc.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "BioSpecimen" nodes that already exist are
        annotated (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = (
        ("Name", "name"),
        ("Specimen_Type", "specimen_type"),
        ("Displayed_Excretion_Concentration_Count", "displayed_excretion_concentration_count"),
        ("Displayed_Biomarker_Count", "displayed_biomarker_count"),
        ("Displayed_Publication_Count", "displayed_publication_count"),
        ("Displayed_Reproducibility_Count", "displayed_reproducibility_count"),
        ("Displayed_Excretions_Correlated_with_Intake_Count",
         "displayed_excretions_correlated_with_intake_count"),
        ("Displayed_Excretions_Associated_with_Intake_Count",
         "displayed_excretions_associated_with_intake_count"),
        ("Displayed_Cancer_Association_Count", "displayed_cancer_association_count"),
    )
    set_clause = ", ".join(f"s.{prop} = line.{column}" for prop, column in columns)
    return (
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (s:BioSpecimen { Exposome_Explorer_ID:"Specimen_"+line["id"] })'
        f" SET {set_clause} "
    )
def annotate_subjects(filename):
    """
    Build the Cypher query that annotates "Subject" nodes from Exposome-Explorer's
    subjects.csv whose ID is already present on the DB.

    Args:
        filename (str): The name of the CSV file that is being imported.

    Returns:
        str: The text of a Cypher query. Only "Subject" nodes that already exist are
        annotated (they are MATCHed, not created).
    """
    # (node property, CSV column), in the order they are SET
    columns = [
        ("Name", "name"),
        ("Description", "description"),
        ("Health_Condition", "health_condition"),
        ("Country", "country"),
        ("Ethnicity", "ethny"),
        ("Gender", "gender"),
        ("Female_Proportion", "female_proportion"),
        ("Size", "size"),
    ]
    # Age, height, weight and BMI all come with the same five summary statistics,
    # stored as e.g. Age_Mean <- age_mean
    columns += [
        (f"{measure}_{stat}", f"{measure.lower()}_{stat.lower()}")
        for measure in ("Age", "Height", "Weight", "BMI")
        for stat in ("Mean", "Min", "Max", "Median", "SD")
    ]
    columns += [
        ("Publication_ID", "publication_id"),
        ("Ancestry", "ancestry"),
        ("Supplement_Exclusion", "supplement_exclusion"),
        ("Smoker_Proportion", "smoker_proportion"),
        ("Cohort_ID", "cohort_id"),
        ("Nb_of_Cases", "nb_of_cases"),
        ("Nb_of_Controls", "nb_of_controls"),
    ]
    set_clause = ", ".join(f"su.{prop} = line.{column}" for prop, column in columns)
    return (
        # The file given by the caller is loaded; "subjects.csv" used to be hard-coded here,
        # which silently ignored the ``filename`` argument.
        f" LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line"
        ' MATCH (su:Subject { Exposome_Explorer_ID:"Subject_"+line["id"] })'
        f" SET {set_clause} "
    )
def remove_counts_and_displayeds(inputfile, outputfile):
    """
    Strips the ``_count`` & ``displayed_`` text-strings from a given file.

    The other functions in this module read columns by their original names, so a column
    whose header lost one of these text-strings is no longer found by them; this is how the
    properties considered not useful for our program get ignored. This is, of course, not
    the most elegant, but it works.

    Args:
        inputfile (str): The path to the file from which ``_count`` & ``displayed_``
            text-strings are to be removed
        outputfile (str): The path of the file where the resulting contents will be written.

    Returns:
        The function does not have a return; instead, it transforms ``inputfile`` into ``outputfile``

    .. NOTE:: The replacement is applied to the whole text of the file, not only to its header line.
    """
    with open(inputfile, 'r') as source:
        contents = source.read()

    # Order matters: "_count" is stripped before "displayed_"
    for unwanted in ('_count', 'displayed_'):
        contents = contents.replace(unwanted, '')

    with open(outputfile, 'w') as target:
        target.write(contents)
def remove_cross_properties():
    """
    Build the Cypher query that removes some properties added by the other functions present
    in this script: those used to cross-reference the different tables of the Relational
    Database EE comes from, which, in a Graph Database, are no longer necessary.

    Returns:
        str: The text of a Cypher query that REMOVEs the cross-reference properties from every node.
    """
    cross_properties = (
        "Component_ID", "Sample_ID", "Experimental_Method_ID", "Unit_ID", "Subject_ID",
        "Converted_to_ID", "Publication_ID",
        "Component_ID",  # listed twice in the query; harmless, kept so the text is unchanged
        "Specimen_ID", "Initial_ID", "Cohort_ID",
    )
    removals = ", ".join("n." + name for name in cross_properties)
    return " MATCH (n) REMOVE " + removals + " "
def build_from_file(databasepath, Neo4JImportPath, driver, bar = None, do_all = False,
                    keep_counts_and_displayeds = True, keep_cross_properties = False):
    """
    A function able to build a portion of the Exposome-Explorer database in graph format, provided that at least
    one "Component" (Metabolite) node is present in said database. It works by using that node as a starting point
    from which to search in the rest of the Exposome-Explorer database, finding related nodes there.

    The Exposome-Explorer CSVs are first staged under ``Neo4JImportPath`` (note that
    ``microbial_metabolite_identifications.csv`` is staged as ``microbial_metabolites.csv``), then the
    "scaffolding" queries and the annotation queries are run, one transaction each, and finally
    every staged file is removed again, even if one of the steps fails.

    Args:
        databasepath (str): The path to the database where all Exposome-Explorer CSVs are stored
        Neo4JImportPath (str): The path from which Neo4J is importing data
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use
        bar: The bar() object from alive_bar, in case we want the function to run with do_all=True.
            It is called once after each transaction when ``do_all`` is True; if it is None, no progress is reported.
        do_all (bool): True if importing the whole database; False if just importing a part of it
        keep_counts_and_displayeds (bool): Whether to keep the properties ending with ``_count`` & ``displayed_``
            that, although present in the original DB, might be considered not useful for us. If False, the files
            are staged through ``remove_counts_and_displayeds`` instead of being copied verbatim.
        keep_cross_properties (bool): Whether to keep the properties used to cross-reference
            in the original Neo4J database.

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.

    .. NOTE:: This won't work if a "Component" (Metabolite) node is not already present; when building the database,
        either full or by parts, you should import the respective Components first

    .. WARNING:: Due to the script's design, only nodes which have a connection to nodes
        previously present on the database will be imported.
        This is on purpose: unconnected nodes don't mean much in a Graph DataBase
    """
    # Set the databasepath to be an absolute path
    databasepath = os.path.abspath(databasepath)

    # (file name inside databasepath, file name under Neo4J's import path), in staging order.
    # Keeping a single table avoids repeating the list for staging and for clean-up.
    csv_files = (
        ("measurements.csv", "measurements.csv"),
        ("samples.csv", "samples.csv"),
        ("experimental_methods.csv", "experimental_methods.csv"),
        ("units.csv", "units.csv"),
        ("reproducibilities.csv", "reproducibilities.csv"),
        ("subjects.csv", "subjects.csv"),
        ("microbial_metabolite_identifications.csv", "microbial_metabolites.csv"),
        ("cancer_associations.csv", "cancer_associations.csv"),
        ("metabolomic_associations.csv", "metabolomic_associations.csv"),
        ("correlations.csv", "correlations.csv"),
        ("cancers.csv", "cancers.csv"),
        ("cohorts.csv", "cohorts.csv"),
        ("publications.csv", "publications.csv"),
        ("specimens.csv", "specimens.csv"),
    )

    # Either copy the files verbatim, or pass them through the column-stripping helper
    stage_file = shutil.copyfile if keep_counts_and_displayeds else remove_counts_and_displayeds

    def run_step(query_builder, csv_name):
        """Builds the query for ``csv_name``, runs it as one transaction and ticks the progress bar"""
        misc.manage_transaction(query_builder(csv_name), driver)
        # ``bar`` defaults to None, so only tick it when one was really provided
        if do_all and bar is not None:
            bar()

    # Only files that were successfully staged are recorded here, so that the clean-up never
    # touches a file this call did not create (e.g. if the very first copy fails)
    staged_paths = []
    try:
        for source_name, import_name in csv_files:
            import_file = f"{Neo4JImportPath}/{import_name}"
            stage_file(f"{databasepath}/{source_name}", import_file)
            staged_paths.append(import_file)

        # First, we build the "scaffolding" - the nodes we will annotate later on
        for query_builder, csv_name in (
            (add_measurements_stuff, "measurements.csv"),
            (add_reproducibilities, "reproducibilities.csv"),
            (add_samples, "samples.csv"),
            (add_subjects, "subjects.csv"),
            (add_microbial_metabolite_identifications, "microbial_metabolites.csv"),
            (add_cancer_associations, "cancer_associations.csv"),
            (add_metabolomic_associations, "metabolomic_associations.csv"),
            (add_correlations, "correlations.csv"),
        ):
            run_step(query_builder, csv_name)

        # Now, we annotate those metabolites
        for query_builder, csv_name in (
            (annotate_measurements, "measurements.csv"),
            (annotate_samples, "samples.csv"),
            (annotate_experimental_methods, "experimental_methods.csv"),
            (annotate_units, "units.csv"),
            (annotate_auto_units, "units.csv"),
            (annotate_cancers, "cancers.csv"),
            (annotate_cohorts, "cohorts.csv"),
            (annotate_microbial_metabolite_info, "microbial_metabolites.csv"),
            (annotate_publications, "publications.csv"),
            (annotate_reproducibilities, "reproducibilities.csv"),
            (annotate_specimens, "specimens.csv"),
            (annotate_subjects, "subjects.csv"),
        ):
            run_step(query_builder, csv_name)

        # Finally, we remove the cross-properties that are of no use anymore (this is optional, of course)
        if not keep_cross_properties:
            misc.manage_transaction(remove_cross_properties(), driver)
    finally:
        # Always leave Neo4J's import directory as we found it, even when a step above failed
        for import_file in staged_paths:
            os.remove(import_file)