Source code for CanGraph.GraphifyHMDB.build_database

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT

"""
A python module that provides the necessary functions to transition the HMDB database to graph format,
either from scratch importing all the nodes (as showcased in :obj:`CanGraph.GraphifyHMDB.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
"""

# Import external modules necessary for the script
from alive_progress import alive_bar # A cute progress bar that shows the script is still running
import os, sys, shutil               # Vital modules to interact with the filesystem
from time import sleep               # A hack to avoid starving the system resources

# Import subscripts for the program
# This hack that allows us to de-duplicate the miscleaneous script in this less-used script
sys.path.append("../")
# .. NOTE:: Please beware that, if using this module by itself, you might need to copy "miscelaneous.py" into your path
# This is not the most elegant, but simplifies code maintenance, and this script shouldnt be used much so...
import miscelaneous as misc

[docs]def add_metabolites(filename):
    """
    Creates "Metabolite" nodes based on XML files obtained from the HMDB website,
    adding some essential identifiers and external properties.

    .. seealso:: This way of working has been taken from
        `William Lyon's Blog <https://lyonwj.com/blog/grandstack-podcast-app-parsing-xml-neo4j-rss-episodes-playlists>`_

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "status"][0]._text AS status,
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "name"][0]._text AS name,
            [X in metabolite._children WHERE X._type = "chemical_formula"][0]._text AS chemical_formula,
            [X in metabolite._children WHERE X._type = "average_molecular_weight"][0]._text AS average_molecular_weight,
            [X in metabolite._children WHERE X._type = "monisotopic_molecular_weight"][0]._text AS monisotopic_molecular_weight,
            [X in metabolite._children WHERE X._type = "iupac_name"][0]._text AS iupac_name,
            [X in metabolite._children WHERE X._type = "cas_registry_number"][0]._text AS cas_registry_number,
            [X in metabolite._children WHERE X._type = "smiles"][0]._text AS smiles,
            [X in metabolite._children WHERE X._type = "inchi"][0]._text AS inchi,
            [X in metabolite._children WHERE X._type = "inchikey"][0]._text AS inchikey,
            [X in metabolite._children WHERE X._type = "state"][0]._text AS state,
            [X in metabolite._children WHERE X._type = "chemspider_id"][0]._text AS chemspider_id,
            [X in metabolite._children WHERE X._type = "drugbank_id"][0]._text AS drugbank_id,
            [X in metabolite._children WHERE X._type = "foodb_id"][0]._text AS foodb_id,
            [X in metabolite._children WHERE X._type = "pubchem_compound_id"][0]._text AS pubchem_compound_id,
            [X in metabolite._children WHERE X._type = "pdb_id"][0]._text AS pdb_id,
            [X in metabolite._children WHERE X._type = "chebi_id"][0]._text AS chebi_id,
            [X in metabolite._children WHERE X._type = "phenol_explorer_compound_id"][0]._text AS phenol_explorer_compound_id,
            [X in metabolite._children WHERE X._type = "knapsack_id"][0]._text AS knapsack_id,
            [X in metabolite._children WHERE X._type = "kegg_id"][0]._text AS kegg_id,
            [X in metabolite._children WHERE X._type = "biocyc_id"][0]._text AS biocyc_id,
            [X in metabolite._children WHERE X._type = "bigg_id"][0]._text AS bigg_id,
            [X in metabolite._children WHERE X._type = "wikipedia_id"][0]._text AS wikipedia_id,
            [X in metabolite._children WHERE X._type = "metlin_id"][0]._text AS metlin_id,
            [X in metabolite._children WHERE X._type = "vmh_id"][0]._text AS vmh_id,
            [X in metabolite._children WHERE X._type = "synthesis_reference"][0]._text AS synthesis_reference,

            [X in metabolite._children WHERE X._type = "secondary_accessions"] AS secondary_accessions,
            [X in metabolite._children WHERE X._type = "synonyms"] AS synonyms

        MERGE (m:Metabolite {{ HMDB_ID:accession }} )
        SET m.Status = status, m.Name = name, m.Formula = chemical_formula,
            m.Average_Mass = average_molecular_weight, m.Monisotopic_Molecular_Weight = monisotopic_molecular_weight,
            m.IUPAC = iupac_name, m.CAS_Number = cas_registry_number, m.SMILES = smiles,
            m.InChI = inchi, m.InChIKey = inchikey, m.State = state, m.ChemSpider_ID = chemspider_id, m.DrugBank_ID = drugbank_id,
            m.FooDB_Compound_ID = foodb_id, m.PubChem_ID = pubchem_compound_id, m.PDB_ID = pdb_id, m.ChEBI_ID = chebi_id,
            m.Phenol_Explorer_Compound_ID = phenol_explorer_compound_id, m.KNApSAcK_ID = knapsack_id, m.KEGG_ID = kegg_id,
            m.Bigg_ID = bigg_id, m.WikiPedia_Article = wikipedia_id, m.METLIN_ID = metlin_id, m.VMH_ID = vmh_id

        WITH split(split(synthesis_reference, ",")[-3], ".")[-2] AS ref_title,
             secondary_accessions, synonyms, synthesis_reference, m

        FOREACH(ignoreMe IN CASE WHEN synthesis_reference IS NOT null THEN [1] ELSE [] END |
            FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
                MERGE (p:Publication {{ Title: ref_title }})
                SET p.Authors = replace(split(synthesis_reference, ". ")[0], ";",",")
                SET p.Publication = split(split(synthesis_reference, ",")[-3], ".")[-1]
                SET p.Date = replace(split(split(synthesis_reference, ",")[-3], "(")[1], ")", "")
                SET p.Volume = split(split(synthesis_reference, ",")[-2], "(")[0]
                SET p.Issue = replace(split(split(synthesis_reference, ",")[-2], "(")[1], ")", "")
                SET p.Pages = split(synthesis_reference, ",")[-1]

                MERGE (m)-[r:CITED_IN]->(p)
                SET r.Type = "Synthesis"
            )
        )

        WITH secondary_accessions, synonyms, m

        SET m.Synonyms = "", m.Secondary_HMDB_IDs = ""
        FOREACH(element in secondary_accessions|
            FOREACH(accession in element._children|
                SET m.Secondary_HMDB_IDs = accession._text + "," + m.Secondary_HMDB_IDs
            )
        )

        FOREACH(element in synonyms|
            FOREACH(synonym in element._children|
                SET m.Synonyms = synonym._text + "," + m.Synonyms
            )
        )
        """)

[docs]def add_diseases(filename):
    """
    Creates "Publication" nodes based on XML files obtained from the HMDB website.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values
        (and, logically, there should be no Publication if there is no Disease)

    .. NOTE:: Publications are created with a (m)-[r:CITED_IN]->(p) relation with Metabolite nodes.
        If one wants to find the Publication nodes related to a given Metabolite/Disease relation,
        one can use:

        .. code-block:: python3

            MATCH p=()-[r:RELATED_WITH]->()
              WITH split(r.PubMed_ID, ",") as pubmed
                UNWIND pubmed as find_this
                MATCH (p:Publication)
                  WHERE p.PubMed_ID = find_this
            RETURN p

    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,

            [X in metabolite._children WHERE X._type = "diseases"] AS diseases

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH diseases, m
        UNWIND diseases AS disease
        UNWIND disease["_children"] AS my_disease
        WITH
            [X in my_disease._children WHERE X._type = "name"][0]._text AS diseasename,
            [X in my_disease._children WHERE X._type = "omin_id"][0]._text AS omim_id,
            [X in my_disease._children WHERE X._type = "references"] AS references,
            m

        UNWIND references as reference
        WITH diseasename, omim_id, reference, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            diseasename, omim_id, m

        MERGE (d:Disease {{Name:diseasename }})
        SET d.OMIM_ID = omim_id

        MERGE (m)-[r:ASSOCIATED_DISEASE_METABOLITE]-(d)

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, m

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]


            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id

            MERGE (m)-[r2:CITED_IN]->(p)
            SET r2.PubMed_ID = ""
            SET r2.PubMed_ID = pubmed_id + "," + r2.PubMed_ID
        )
        
        """)

[docs]def add_concentrations_normal(filename):
    """
    Creates "Concentration" nodes based on XML files obtained from the HMDB website.
    In this function, only metabolites that are labeled as "normal_concentration" are added.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values

    .. WARNING:: Using the CREATE row forces the creation of a Concentration node, even when
        some values might be missing. However, this means some bogus nodes could be added,
        which MUST be accounted for at the end of the DB-Creation process.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "normal_concentrations"] AS normal_concentrations

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH normal_concentrations, m
        UNWIND normal_concentrations AS normal_concentration
        WITH normal_concentration, m
        UNWIND normal_concentration["_children"] AS my_concentrations

        WITH
            [X in my_concentrations._children WHERE X._type = "_type"][0]._text AS biospecimen,
            [X in my_concentrations._children WHERE X._type = "concentration_value"][0]._text AS value,
            [X in my_concentrations._children WHERE X._type = "concentration_units"][0]._text AS units,
            [X in my_concentrations._children WHERE X._type = "subject_age"][0]._text AS subject_age,
            [X in my_concentrations._children WHERE X._type = "subject_sex"][0]._text AS subject_sex,
            [X in my_concentrations._children WHERE X._type = "subject_condition"][0]._text AS subject_condition,
            [X in my_concentrations._children WHERE X._type = "comment"][0]._text AS comment,

            [X in my_concentrations._children WHERE X._type = "references"] AS references,
            m

        UNWIND references as reference
        WITH biospecimen, value, units, subject_age, subject_sex, subject_condition, reference,
             comment, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            biospecimen, value, units, subject_age, subject_sex, subject_condition, comment,
            m

        CREATE (c:Measurement {{Normal:"True"}})
        SET c.Value = value, c.Comments = comment

        CREATE (sb:Subject)
        SET sb.Age = replace(subject_age, "&gt;", ">"),
            sb.Gender = replace(subject_sex, "Both", "Female + Male"),
            sb.Information = subject_condition

        MERGE (m)-[r5:MEASURED_AS]->(c)
        MERGE (c)-[r7:TAKEN_FROM_SUBJECT]->(sb)

        FOREACH(ignoreMe IN CASE WHEN units IS NOT null THEN [1] ELSE [] END |
            MERGE  (un:Unit {{Name:units}})
            MERGE (c)-[r6:MEASURED_IN]->(un)
            )
        FOREACH(ignoreMe IN CASE WHEN biospecimen IS NOT null THEN [1] ELSE [] END |
            MERGE  (bs:BioSpecimen {{Name:biospecimen}})
            MERGE (c)-[r8:FOUND_IN]->(bs)
            )

        SET r5.PubMed_ID = ""
        SET r5.PubMed_ID = pubmed_id + "," + r5.PubMed_ID

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
        pubmed_id, reference_text, c

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]


            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id

            MERGE (c)-[r2:CITED_IN]->(p)
        )

        """)

[docs]def add_concentrations_abnormal(filename):
    """
    Creates "Concentration" nodes based on XML files obtained from the HMDB website.
    In this function, only metabolites that are labeled as "abnormal_concentration" are added.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values

    .. WARNING:: Using the CREATE row forces the creation of a Concentration node, even when
        some values might be missing. However, this means some bogus nodes could be added,
        which MUST be accounted for at the end of the DB-Creation process.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "abnormal_concentrations"] AS abnormal_concentrations

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH abnormal_concentrations, m
        UNWIND abnormal_concentrations AS abnormal_concentration
        WITH abnormal_concentration, m
        UNWIND abnormal_concentration["_children"] AS my_concentrations

        WITH
            [X in my_concentrations._children WHERE X._type = "_type"][0]._text AS biospecimen,
            [X in my_concentrations._children WHERE X._type = "concentration_value"][0]._text AS value,
            [X in my_concentrations._children WHERE X._type = "concentration_units"][0]._text AS units,
            [X in my_concentrations._children WHERE X._type = "patient_age"][0]._text AS patient_age,
            [X in my_concentrations._children WHERE X._type = "patient_sex"][0]._text AS patient_sex,
            [X in my_concentrations._children WHERE X._type = "patient_information"][0]._text AS patient_information,
            [X in my_concentrations._children WHERE X._type = "comment"][0]._text AS comment,

            [X in my_concentrations._children WHERE X._type = "references"] AS references,
            m

        UNWIND references as reference
        WITH biospecimen, value, units, patient_age, patient_sex, patient_information, reference,
             comment, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            biospecimen, value, units, patient_age, patient_sex, patient_information, comment, m

        CREATE (c:Measurement {{Normal:"True"}})
        SET c.Value = value, c.Comments = comment

        CREATE (sb:Subject)
        SET sb.Age = replace(patient_age, "&gt;", ">"), sb.Gender = replace(patient_sex, "Both", "Female + Male"), sb.Information = patient_information

        MERGE (m)-[r5:MEASURED_AS]->(c)
        MERGE (c)-[r7:TAKEN_FROM_SUBJECT]->(sb)

        FOREACH(ignoreMe IN CASE WHEN units IS NOT null THEN [1] ELSE [] END |
            MERGE  (un:Unit {{Name:units}})
            MERGE (c)-[r6:MEASURED_IN]->(un)
            )
        FOREACH(ignoreMe IN CASE WHEN biospecimen IS NOT null THEN [1] ELSE [] END |
            MERGE  (bs:BioSpecimen {{Name:biospecimen}})
            MERGE (c)-[r8:FOUND_IN]->(bs)
            )

        SET r5.PubMed_ID = ""
        SET r5.PubMed_ID = pubmed_id + "," + r5.PubMed_ID

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
            pubmed_id, reference_text, c

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]


            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id

            MERGE (c)-[r2:CITED_IN]->(p)
        )

        """)

[docs]def add_taxonomy(filename):
    """
    Creates "Taxonomy" nodes based on XML files obtained from the HMDB website.
    These represent the "kind" of metabolite we are dealing with (Family, etc)

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: It only creates relationships in the Kingdom -> Super Class -> Class -> Subclass
        direction, and from any node -> Metabolite. This means that, if any member of the
        Kingdom -> Super Class -> Class -> Subclass is absent, the line will be broken; hopefully
        in that case a new metabolite will come in to rescue and settle the relation!
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "taxonomy"] AS taxonomy

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH taxonomy, m
        UNWIND taxonomy as my_nodes

        WITH
            [X IN my_nodes._children WHERE X._type = "description"][0]._text AS description,
            [X IN my_nodes._children WHERE X._type = "direct_parent"][0]._text AS direct_parent,
            [X IN my_nodes._children WHERE X._type = "kingdom"][0]._text AS kingdom,
            [X IN my_nodes._children WHERE X._type = "super_class"][0]._text AS super_class,
            [X IN my_nodes._children WHERE X._type = "class"][0]._text AS class,
            [X IN my_nodes._children WHERE X._type = "sub_class"][0]._text AS sub_class,

            [X IN my_nodes._children WHERE X._type = "alternative_parents"] AS alternative_parents,
            [X IN my_nodes._children WHERE X._type = "substituents"] AS substituents,
            [X IN my_nodes._children WHERE X._type = "external_descriptors"] AS external_descriptors,
            m

        SET m.Description = apoc.text.capitalize(description)

        // First, we create the Taxonomy nodes independently

        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null THEN [1] ELSE [] END |
            MERGE (k:Taxonomy {{Type:"Kingdom", Name:kingdom}})
        )
        FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (sp:Taxonomy {{Type:"Super Class", Name:super_class}})
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{Type:"Class", Name:class}})
        )
        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END |
            MERGE (sb:Taxonomy {{Type:"Sub Class", Name:sub_class}})
        )
        FOREACH(ignoreMe IN CASE WHEN direct_parent IS NOT null THEN [1] ELSE [] END |
            MERGE (dp:Taxonomy {{Name:direct_parent}})
            MERGE (m)-[:PART_OF_CLADE]->(dp)
        )

        // Then, we add a hierarchy connecting the nodes as much as possible between them

        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (k:Taxonomy {{ Type:"Kingdom", Name:kingdom }})
            MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }})
            MERGE (k)-[:PART_OF_CLADE]->(sp)
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{ Type:"Class", Name:class }})
            MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }})
            MERGE (sp)-[:PART_OF_CLADE]->(c)
        )
        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null AND class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{ Type:"Class", Name:class }})
            MERGE (sb:Taxonomy {{ Type:"Sub Class", Name:sub_class }})
            MERGE (sb)-[:PART_OF_CLADE]->(c)
        )

        // And we connect the hierarchy to the main node just once

        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:sub_class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null
                AND sub_class IS null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null
                AND sub_class IS null AND class IS null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:super_class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND sub_class IS null
                AND class IS null AND super_class IS null  THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:kingdom }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )

        // We add the alternative_parents in the appropriate format

        FOREACH(element in alternative_parents|
            FOREACH(taxonomy in element._children|
                MERGE (t:Taxonomy {{Name:taxonomy._text}})
                MERGE (m)-[:PART_OF_CLADE]->(t)
            )
        )

        // If any Taxonomy is left without a connection, we connect it to the main graph
        // Beware: if any disconnected taxonomy is left from before, this could lead to errors

        WITH m, alternative_parents
        MATCH (tt:Taxonomy) WHERE NOT (tt)--()
        MERGE (m)-[:PART_OF_CLADE]->(tt)
        """)

[docs]def add_experimental_properties(filename):
    """
    Adds properties to existing "Metabolite" nodes based on XML files obtained from the HMDB website.
    In this case, only properties labeled as <experimental_properties> are added.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Another option would have been to auto-add all the properties, and name them using
        RETURN "Experimental " + apoc.text.capitalizeAll(replace(kind, "_", " ")), value; however, this
        way we can select and not duplicate / overwrite values.

    .. TODO:: It would be nice to be able to distinguish between experimental and predicted properties
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "experimental_properties"] AS experimental_properties

        UNWIND experimental_properties as experimental_property
        WITH experimental_property, accession
        UNWIND experimental_property["_children"] AS my_property
        WITH my_property, accession
        WITH
            [X in my_property._children WHERE X._type = "kind"][0]._text AS kind,
            [X in my_property._children WHERE X._type = "value"][0]._text AS value,
            accession

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH apoc.map.fromLists(collect(kind), collect(value)) AS dict, m
        SET m.Water_Solubility = dict["water_solubility"], m.logP = dict["logp"],
             m.Melting_Point = dict["melting_point"], m.Boiling_Point = dict["boiling_point"]
        """)

[docs]def add_predicted_properties(filename):
    """
    Adds properties to existing "Metabolite" nodes based on XML files obtained from the HMDB website.
    In this case, only properties labeled as <predicted_properties> are added.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Another option would have been to auto-add all the properties, and name them using
        RETURN "Predicted " + apoc.text.capitalizeAll(replace(kind, "_", " ")), value; however, this
        way we can select and not duplicate / overwrite values.

    .. TODO:: It would be nice to be able to distinguish between experimental and predicted properties
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "predicted_properties"] AS predicted_properties

        UNWIND predicted_properties as predicted_property
        WITH predicted_property, accession
        UNWIND predicted_property["_children"] AS my_property
        WITH my_property, accession
        WITH
            [X in my_property._children WHERE X._type = "kind"][0]._text AS kind,
            [X in my_property._children WHERE X._type = "value"][0]._text AS value,
            accession

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH apoc.map.fromLists(collect(kind), collect(value)) AS dict, m
        SET m.Bioavailability = dict["bioavailability"],
            m.Donor_Count = dict["donor_count"],  m.Polar_Surface_Area = dict["polar_surface_area"],
            m.Ro5 = dict["rule_of_five"], m.pKa_Strongest_Acidic = dict["pka_strongest_acidic"],
            m.pKa_Strongest_Basic = dict["pka_strongest_basic"], m.Number_of_Rings = dict["number_of_rings"],
            m.Physiological_Charge = dict["physiological_charge"], m.Polarizability = dict["polarizability"],
            m.logS = dict["logs"], m.MDDR_Like_Rule = dict["mddr_like_rule"],
            m.Ghose_Filter = dict["ghose_filter"], m.Refractivity = dict["refractivity"],
            m.Rotatable_Bond_Count = dict["rotatable_bond_count"], m.Acceptor_Count = dict["acceptor_count"],
            m.Formal_Charge = dict["formal_charge"], m.Verber_Rule = dict["veber_rule"]
        """)

[docs]def add_biological_properties(filename):
    """
    Adds biological properties to existing "Metabolite" nodes based on XML files obtained from the HMDB website.
    In this case, only properties labeled as <predicted_properties> are added.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Another option would have been to auto-add all the properties, and name them using
        RETURN "Predicted " + apoc.text.capitalizeAll(replace(kind, "_", " ")), value; however, this
        way we can select and not duplicate / overwrite values.

    .. TODO:: It would be nice to be able to distinguish between experimental and predicted properties
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "biological_properties"] AS biological_properties

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH biological_properties, m
        UNWIND biological_properties AS biological_property
        WITH biological_property, m
        UNWIND biological_property["_children"] AS my_property

        WITH
            [X IN my_property._children WHERE X._type = "cellular"] AS cellulars,
            [X IN my_property._children WHERE X._type = "biospecimen"] AS biospecimens,
            [X IN my_property._children WHERE X._type = "tissue"] AS tissues,
            [X IN my_property._children WHERE X._type = "pathway"] AS pathways,
            m

        FOREACH(location IN cellulars|
            MERGE (c:CelularLocation {{Name:location._text}})
            MERGE (m)-[r:LOCATED_INSIDE_CELL]->(c)
        )
        FOREACH(location IN biospecimens|
            MERGE (b:BioSpecimen {{Name:location._text}})
            MERGE (m)-[r:LOCATED_IN_BIOSPECIMEN]->(b)
        )
        FOREACH(location IN tissues|
            MERGE (t:Tissue {{Name:location._text}})
            MERGE (m)-[r:LOCATED_IN_TISSUE]->(t)
        )
        WITH pathways, m
        UNWIND pathways as my_pathways
        WITH my_pathways, m
        WITH
            [X in my_pathways._children WHERE X._type = "name"][0]._text AS name,
            [X in my_pathways._children WHERE X._type = "smpdb_id"][0]._text AS smpdb_id,
            [X in my_pathways._children WHERE X._type = "kegg_map_id"][0]._text AS kegg_map_id,
            m

        MERGE (p:Pathway {{Name:name}})
        SET p.SMPDB_ID = smpdb_id, p.KEGG_ID = kegg_map_id
        MERGE (m)-[r:PART_OF_PATHWAY]->(p)
        """)

[docs]def add_proteins(filename):
    """
    Creates "Protein" nodes based on XML files obtained from the HMDB website.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: We are not creating "Gene" nodes (even though each protein comes from a given gene)
        because we believe not enough information is being given about them.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "name"][0]._text AS name,
            [X in metabolite._children WHERE X._type = "protein_type"][0]._text AS protein_type,
            [X in metabolite._children WHERE X._type = "gene_name"][0]._text AS gene_name,
            [X in metabolite._children WHERE X._type = "general_function"][0]._text AS general_function,
            [X in metabolite._children WHERE X._type = "specific_function"][0]._text AS specific_function,
            [X in metabolite._children WHERE X._type = "genbank_protein_id"][0]._text AS genbank_protein_id,
            [X in metabolite._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id,
            [X in metabolite._children WHERE X._type = "uniprot_name"][0]._text AS uniprot_name,
            [X in metabolite._children WHERE X._type = "genbank_gene_id"][0]._text AS genbank_gene_id,
            [X in metabolite._children WHERE X._type = "genecard_id"][0]._text AS genecard_id,
            [X in metabolite._children WHERE X._type = "geneatlas_id"][0]._text AS geneatlas_id,
            [X in metabolite._children WHERE X._type = "hgnc_id"][0]._text AS hgnc_id,

            [X in metabolite._children WHERE X._type = "subcellular_locations"] AS subcellular_locations,
            [X in metabolite._children WHERE X._type = "secondary_accessions"] AS secondary_accessions,
            [X in metabolite._children WHERE X._type = "pdb_ids"] AS pdb_ids,
            [X in metabolite._children WHERE X._type = "synonyms"] AS synonyms

        MERGE (p:Protein {{ HMDB_ID:accession }} )
        SET p.Name = name, p.UniProt_ID = uniprot_id,
            p.Function = protein_type, p.Gene_Name = gene_name, p.Function = general_function,
            p.Specific_Function = specific_function, p.Genbank_Protein_ID = genbank_protein_id,
            p.GenBank_Gene_ID = genbank_gene_id, p.GeneCards_ID = genecard_id,
            p.GenAtlas_ID = geneatlas_id, p.HGNC_ID = hgnc_id

        WITH secondary_accessions, synonyms, pdb_ids, subcellular_locations, p

        FOREACH(element in subcellular_locations|
            FOREACH(location in element._children|
                MERGE (c:CelularLocation {{Name:location._text}})
                MERGE (p)-[r:LOCATED_INSIDE_CELL]->(c)
            )
        )

        SET p.Synonyms = "", p.Secondary_HMDB_IDs = "", p.PDB_ID = ""
        FOREACH(element in secondary_accessions|
            FOREACH(accession in element._children|
                SET p.Secondary_HMDB_IDs = accession._text + "," + p.Secondary_HMDB_IDs
            )
        )
        FOREACH(element in synonyms|
            FOREACH(synonym in element._children|
                SET p.Synonyms = synonym._text + "," + p.Synonyms
            )
        )
        FOREACH(element in pdb_ids|
            FOREACH(pdb in element._children|
                SET p.PDB_ID = pdb._text + "," + p.PDB_ID
            )
        )
        """)

[docs]def add_go_classifications(filename):
    """
    Creates "Gene Ontology" nodes based on XML files obtained from the HMDB website.
    This relates each protein to some GO-Terms

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "go_classifications"] AS go_classifications

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH go_classifications, p
        UNWIND go_classifications AS go_class
        WITH go_class, p
        UNWIND go_class["_children"] AS my_class

        WITH
            [X in my_class._children WHERE X._type = "category"][0]._text AS category,
            [X in my_class._children WHERE X._type = "description"][0]._text AS description,
            [X in my_class._children WHERE X._type = "go_id"][0]._text AS go_id,
            p

        MERGE (g:GeneOntology {{ Description:description }})
        SET g.GO_ID = go_id, g.Category = category

        MERGE (p)-[r:PART_OF_GENE_ONTOLOGY]-(g)
        """)

[docs]def add_gene_properties(filename):
    """
    Adds some properties to existing "Protein" nodes based on XML files obtained from the HMDB website.
    In this case, properties will mostly relate to the gene from which the protein originates.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: We are not creating "Gene" nodes (even though each protein comes from a given gene)
        because we believe not enough information is being given about them.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "gene_properties"] AS gene_properties

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH gene_properties, p
        UNWIND gene_properties AS gene_property

        WITH
            [X in gene_property._children WHERE X._type = "chromosome_location"][0]._text AS chromosome_location,
            [X in gene_property._children WHERE X._type = "locus"][0]._text AS locus,
            [X in gene_property._children WHERE X._type = "gene_sequence"][0]._text AS gene_sequence,
            p

        WITH
            replace(replace(gene_sequence, split(gene_sequence, "bp")[0]+"bp", ""), " ", "") as SEQUENCE,
            chromosome_location, locus, gene_sequence, p

        FOREACH(ignoreMe IN CASE WHEN SEQUENCE IS NOT null AND SEQUENCE <> "" THEN [1] ELSE [] END |
            MERGE (se:Sequence {{ Sequence:SEQUENCE }} )
            SET se.Type= "DNA", se.Chromosome_Location = chromosome_location, se.Locus = locus
            MERGE (p)-[r:SEQUENCED_AS]->(se)
        )

        """)

[docs]def add_protein_properties(filename):
    """
    Adds some properties to existing "Protein" nodes based on XML files obtained from the HMDB website.
    In this case, properties will mostly relate to the protein itself.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: The "signal_regions" and the "transmembrane_regions" properties were left out
        because, after a preliminary search, they were mostly empty
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "protein_properties"] AS protein_properties,
            [X in metabolite._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH protein_properties, p, uniprot_id
        UNWIND protein_properties AS protein_property

        WITH
            [X in protein_property._children WHERE X._type = "residue_number"][0]._text AS residue_number,
            [X in protein_property._children WHERE X._type = "molecular_weight"][0]._text AS molecular_weight,
            [X in protein_property._children WHERE X._type = "theoretical_pi"][0]._text AS theoretical_pi,
            [X in protein_property._children WHERE X._type = "pfams"] AS pfams,
            [X in protein_property._children WHERE X._type = "polypeptide_sequence"][0]._text AS polypeptide_sequence,
            p, uniprot_id

        SET p.Residue_Number = residue_number, p.Molecular_Weight = molecular_weight,
            p.Theoretical_PI = theoretical_pi

        FOREACH(ignoreMe IN CASE WHEN polypeptide_sequence IS NOT null AND polypeptide_sequence <> "" THEN [1] ELSE [] END |
            MERGE (se:Sequence {{ Sequence: polypeptide_sequence }} )
            SET se.Type= "PROT", se.UniProt_ID = uniprot_id
            MERGE (p)-[r:SEQUENCED_AS]->(se)
        )


        WITH p, pfams
        UNWIND pfams AS pfam
        WITH p, pfam
        UNWIND pfam["_children"] AS my_pfam

        WITH
            [X in my_pfam._children WHERE X._type = "name"][0]._text AS name,
            [X in my_pfam._children WHERE X._type = "pfam_id"][0]._text AS pfam_id,
            p

        MERGE (pf:PFam {{ PFAM_ID:pfam_id }})
        SET pf.Name = name
        MERGE (p)-[r:PART_OF_PFAM]->(pf)

        """)

[docs]def add_general_references(filename, type_of):
    """
    Creates "Publication" nodes based on XML files obtained from the HMDB website.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Since not all nodes present a "PubMed_ID" field (which would be ideal to uniquely-identify
        Publications, as the "Text" field is way more prone to typos/errors), nodes will be created using
        the "Authors" field. This means some duplicates might exist, which should be accounted for.

    .. NOTE:: Unlike the rest, here we are not matching metabolites, but ALSO proteins. This is intentional.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "{type_of}"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "general_references"] AS general_references

        MATCH (m) WHERE (m:Metabolite OR m:Protein) AND m.HMDB_ID = accession

        WITH general_references, m
        UNWIND general_references AS general_reference
        WITH general_reference, m
        UNWIND general_reference["_children"] AS my_reference

        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            m

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, m
        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]


            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id

            MERGE (m)-[r:CITED_IN]->(p)
        )

        """)

[docs]def add_protein_associations(filename):
    """
    Creates "Protein" nodes based on XML files obtained from the HMDB website.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Unlike the "add_protein" function, this creates Proteins based on info on the
        "Metabolite" files, not on the "Protein" files themselves. This could mean node duplication, but,
        hopefully, the MERGE by Accession will mean that this duplicates will be catched.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "protein_associations"] AS protein_associations

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH protein_associations, m
        UNWIND protein_associations AS protein_association
        WITH protein_association, m
        UNWIND protein_association["_children"] AS my_protein

        WITH
            [X in my_protein._children WHERE X._type = "protein_accession"][0]._text AS protein_accession,
            [X in my_protein._children WHERE X._type = "name"][0]._text AS name,
            [X in my_protein._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id,
            [X in my_protein._children WHERE X._type = "gene_name"][0]._text AS gene_name,
            [X in my_protein._children WHERE X._type = "protein_type"][0]._text AS protein_type,
            m

        MERGE (p:Protein {{ HMDB_ID:protein_accession }})
        ON CREATE SET p.Gene_Name = gene_name, p.Function = protein_type,
                      p.UniProt_ID = uniprot_id, p.Name = name

        MERGE (m)-[r:INTERACTS_WITH]-(p)
        """)

[docs]def add_metabolite_associations(filename):
    """
    Adds associations contained in the "protein" file, between proteins and metabolites.

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: Like he "add_metabolite_associations" function, this creates non-directional
        relationships (m)-[r:ASSOCIATED_WITH]-(p) ; this helps duplicates be detected.

    .. NOTE:: The "ON CREATE SET" clause for the "Name" param ensures no overwriting
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "metabolite_associations"] AS metabolite_associations

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH metabolite_associations, p
        UNWIND metabolite_associations AS metabolite_association
        WITH metabolite_association, p
        UNWIND metabolite_association["_children"] AS my_metabolite

        WITH
            [X in my_metabolite._children WHERE X._type = "accession"][0]._text AS metabolite_accession,
            [X in my_metabolite._children WHERE X._type = "name"][0]._text AS name,
            p

        MERGE (m:Metabolite {{ HMDB_ID:metabolite_accession }})
        ON CREATE SET m.Name = name

        MERGE (m)-[r:INTERACTS_WITH]-(p)
        """)

[docs]def add_metabolite_references(filename):
    """
    Creates references for relations betweens Protein nodes and Metabolite nodes

    Args:
        tx          (neo4j.Session): The session under which the driver is running
        filename    (str): The name of the XML file that is being imported

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. WARNING:: Unfortunately, Neo4J makes it really, really, really difficult to work with XML,
        and so, this time, a r.PubMed_ID list with the references could not be created. Nonetheless,
        I considered adding this useful.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "metabolite_references"] AS metabolite_references

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH metabolite_references, p
        UNWIND metabolite_references AS metabolite_reference
        WITH metabolite_reference, p
        UNWIND metabolite_reference["_children"] AS my_reference
        WITH my_reference, p
        UNWIND my_reference["_children"] AS my_ref

        WITH
            [X in my_ref._children WHERE X._type = "accession"][0]._text AS metabolite_accession,
            [X in my_ref._children WHERE X._type = "name"][0]._text AS name,
            [X in my_ref._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_ref._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            p

        FOREACH(ignoreMe IN CASE WHEN metabolite_accession IS NOT null THEN [1] ELSE [] END |
            MERGE (m:Metabolite {{ HMDB_ID:metabolite_accession }})
            ON CREATE SET m.name = name
            MERGE (m)-[r:INTERACTS_WITH]-(p)
            )

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, p
        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (pu:Publication {{ Title: ref_title }})
            SET pu.Authors = split(reference_text, ":")[0]

            SET pu.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET pu.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET pu.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET pu.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET pu.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET pu.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET pu.PubMed_ID = pubmed_id

            MERGE (p)-[r:CITED_IN]->(pu)
        )

        """)

[docs]def build_from_protein_file(newfile, driver):
    """
    A function able to build a portion of the HMDB database in graph format, provided that one "Protein" XML is supplied to it.
    This are downloaded separately from the website, as ```hmdb_proteins.zip```, and can be presented either as the full file,
    or as a splitted version of it, with just one item per file (which is recommended due to memory limitations)

    Args:
        newfile         (str): The path of the XML file to import
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.
    """
    misc.manage_transaction(add_proteins(newfile), driver)
    misc.manage_transaction(add_go_classifications(newfile), driver)
    misc.manage_transaction(add_gene_properties(newfile), driver)
    misc.manage_transaction(add_protein_properties(newfile), driver)
    misc.manage_transaction(add_metabolite_associations(newfile), driver)
    misc.manage_transaction(add_metabolite_references(newfile), driver)
    misc.manage_transaction(add_general_references(newfile, "protein"), driver)


[docs]def build_from_metabolite_file(newfile, driver):
    """
    A function able to build a portion of the HMDB database in graph format, provided that one "Metabolite" XML is supplied to it.
    This are downloaded separately from the website, as all the files that are not ```hmdb_proteins.zip```, and can be presented either
    as the full file, or as a splitted version of it, with just one item per file (which is recommended due to memory limitations)

    Args:
        newfile         (str): The path of the XML file to import
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.
    """
    misc.manage_transaction(add_metabolites(newfile), driver)
    misc.manage_transaction(add_protein_associations(newfile), driver)
    misc.manage_transaction(add_diseases(newfile), driver)
    misc.manage_transaction(add_concentrations_normal(newfile), driver)
    misc.manage_transaction(add_concentrations_abnormal(newfile), driver)
    misc.manage_transaction(add_taxonomy(newfile), driver)
    misc.manage_transaction(add_biological_properties(newfile), driver)
    misc.manage_transaction(add_experimental_properties(newfile), driver)
    misc.manage_transaction(add_predicted_properties(newfile), driver)
    misc.manage_transaction(add_general_references(newfile, "metabolite"), driver)