#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT
"""
A python module that provides the necessary functions to transition the HMDB database to graph format,
either from scratch importing all the nodes (as showcased in :obj:`CanGraph.GraphifyHMDB.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
"""
# Import external modules necessary for the script
from alive_progress import alive_bar # A cute progress bar that shows the script is still running
import os, sys, shutil # Vital modules to interact with the filesystem
from time import sleep # A hack to avoid starving the system resources
# Import subscripts for the program
# This hack allows us to reuse the shared "miscelaneous" script from this less-used script instead of duplicating it
sys.path.append("../")
# .. NOTE:: Please beware that, if using this module by itself, you might need to copy "miscelaneous.py" into your path
# This is not the most elegant solution, but it simplifies code maintenance, and this script shouldn't be used much, so...
import miscelaneous as misc
def add_diseases(filename):
    """
    Builds the CYPHER statement that imports "Disease" and "Publication" nodes
    from the ``<diseases>`` section of each ``<metabolite>`` in an HMDB XML file.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values
        (and, logically, there should be no Publication if there is no Disease)

    .. NOTE:: Publications are created with a (m)-[r:CITED_IN]->(p) relation with Metabolite nodes,
        and the PubMed ID is stored (followed by a comma) in the ``PubMed_ID`` property of that relation.
        If one wants to find the Publication nodes referenced by those relations, one can use:

        .. code-block:: cypher

            MATCH ()-[r:CITED_IN]->()
            WITH split(r.PubMed_ID, ",") as pubmed
            UNWIND pubmed as find_this
            MATCH (p:Publication)
            WHERE p.PubMed_ID = find_this
            RETURN p

    .. NOTE:: The Publication fields (Authors, Title, Date, Volume...) are obtained by splitting
        ``reference_text`` on ":", ".", ";" and "(" characters, so their quality depends on the
        reference following that layout.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "diseases"] AS diseases

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH diseases, m
        UNWIND diseases AS disease
        UNWIND disease["_children"] AS my_disease
        WITH
            [X in my_disease._children WHERE X._type = "name"][0]._text AS diseasename,
            [X in my_disease._children WHERE X._type = "omim_id"][0]._text AS omim_id,
            [X in my_disease._children WHERE X._type = "references"] AS references,
            m
        UNWIND references as reference
        WITH diseasename, omim_id, reference, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            diseasename, omim_id, m

        MERGE (d:Disease {{Name:diseasename }})
        SET d.OMIM_ID = omim_id
        MERGE (m)-[r:ASSOCIATED_DISEASE_METABOLITE]-(d)

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, m

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]
            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id
            MERGE (m)-[r2:CITED_IN]->(p)
            SET r2.PubMed_ID = ""
            SET r2.PubMed_ID = pubmed_id + "," + r2.PubMed_ID
        )
        """)
def add_concentrations_normal(filename):
    """
    Builds the CYPHER statement that imports "Measurement" nodes (together with their "Subject",
    "Unit", "BioSpecimen" and "Publication" nodes) based on XML files obtained from the HMDB website.
    In this function, only the concentrations found under "normal_concentrations" are added;
    the Measurement nodes are created with ``Normal:"True"``.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values

    .. WARNING:: Using the CREATE row forces the creation of a Measurement node, even when
        some values might be missing. However, this means some bogus nodes could be added,
        which MUST be accounted for at the end of the DB-Creation process.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "normal_concentrations"] AS normal_concentrations

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH normal_concentrations, m
        UNWIND normal_concentrations AS normal_concentration
        WITH normal_concentration, m
        UNWIND normal_concentration["_children"] AS my_concentrations
        WITH
            [X in my_concentrations._children WHERE X._type = "biospecimen"][0]._text AS biospecimen,
            [X in my_concentrations._children WHERE X._type = "concentration_value"][0]._text AS value,
            [X in my_concentrations._children WHERE X._type = "concentration_units"][0]._text AS units,
            [X in my_concentrations._children WHERE X._type = "subject_age"][0]._text AS subject_age,
            [X in my_concentrations._children WHERE X._type = "subject_sex"][0]._text AS subject_sex,
            [X in my_concentrations._children WHERE X._type = "subject_condition"][0]._text AS subject_condition,
            [X in my_concentrations._children WHERE X._type = "comment"][0]._text AS comment,
            [X in my_concentrations._children WHERE X._type = "references"] AS references,
            m
        UNWIND references as reference
        WITH biospecimen, value, units, subject_age, subject_sex, subject_condition, reference,
             comment, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            biospecimen, value, units, subject_age, subject_sex, subject_condition, comment,
            m

        CREATE (c:Measurement {{Normal:"True"}})
        SET c.Value = value, c.Comments = comment

        // The escaped greater-than entity is spelled as "&" + "gt;" so that tools which
        // HTML-unescape this source cannot silently turn the replacement into a no-op
        CREATE (sb:Subject)
        SET sb.Age = replace(subject_age, "&" + "gt;", ">"),
            sb.Gender = replace(subject_sex, "Both", "Female + Male"),
            sb.Information = subject_condition

        MERGE (m)-[r5:MEASURED_AS]->(c)
        MERGE (c)-[r7:TAKEN_FROM_SUBJECT]->(sb)

        FOREACH(ignoreMe IN CASE WHEN units IS NOT null THEN [1] ELSE [] END |
            MERGE (un:Unit {{Name:units}})
            MERGE (c)-[r6:MEASURED_IN]->(un)
        )
        FOREACH(ignoreMe IN CASE WHEN biospecimen IS NOT null THEN [1] ELSE [] END |
            MERGE (bs:BioSpecimen {{Name:biospecimen}})
            MERGE (c)-[r8:FOUND_IN]->(bs)
        )

        SET r5.PubMed_ID = ""
        SET r5.PubMed_ID = pubmed_id + "," + r5.PubMed_ID

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, c

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]
            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id
            MERGE (c)-[r2:CITED_IN]->(p)
        )
        """)
def add_concentrations_abnormal(filename):
    """
    Builds the CYPHER statement that imports "Measurement" nodes (together with their "Subject",
    "Unit", "BioSpecimen" and "Publication" nodes) based on XML files obtained from the HMDB website.
    In this function, only the concentrations found under "abnormal_concentrations" are added;
    the Measurement nodes are created with ``Normal:"False"``.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Here, an UNWIND clause is used instead of a FOREACH clause. This provides
        better performance, since, unlike FOREACH, UNWIND does not process rows with empty values

    .. WARNING:: Using the CREATE row forces the creation of a Measurement node, even when
        some values might be missing. However, this means some bogus nodes could be added,
        which MUST be accounted for at the end of the DB-Creation process.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "abnormal_concentrations"] AS abnormal_concentrations

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH abnormal_concentrations, m
        UNWIND abnormal_concentrations AS abnormal_concentration
        WITH abnormal_concentration, m
        UNWIND abnormal_concentration["_children"] AS my_concentrations
        WITH
            [X in my_concentrations._children WHERE X._type = "biospecimen"][0]._text AS biospecimen,
            [X in my_concentrations._children WHERE X._type = "concentration_value"][0]._text AS value,
            [X in my_concentrations._children WHERE X._type = "concentration_units"][0]._text AS units,
            [X in my_concentrations._children WHERE X._type = "patient_age"][0]._text AS patient_age,
            [X in my_concentrations._children WHERE X._type = "patient_sex"][0]._text AS patient_sex,
            [X in my_concentrations._children WHERE X._type = "patient_information"][0]._text AS patient_information,
            [X in my_concentrations._children WHERE X._type = "comment"][0]._text AS comment,
            [X in my_concentrations._children WHERE X._type = "references"] AS references,
            m
        UNWIND references as reference
        WITH biospecimen, value, units, patient_age, patient_sex, patient_information, reference,
             comment, m
        UNWIND reference["_children"] AS my_reference
        WITH
            [X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
            [X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
            biospecimen, value, units, patient_age, patient_sex, patient_information, comment, m

        // These measurements come from abnormal_concentrations, hence Normal is "False"
        CREATE (c:Measurement {{Normal:"False"}})
        SET c.Value = value, c.Comments = comment

        // The escaped greater-than entity is spelled as "&" + "gt;" so that tools which
        // HTML-unescape this source cannot silently turn the replacement into a no-op
        CREATE (sb:Subject)
        SET sb.Age = replace(patient_age, "&" + "gt;", ">"),
            sb.Gender = replace(patient_sex, "Both", "Female + Male"),
            sb.Information = patient_information

        MERGE (m)-[r5:MEASURED_AS]->(c)
        MERGE (c)-[r7:TAKEN_FROM_SUBJECT]->(sb)

        FOREACH(ignoreMe IN CASE WHEN units IS NOT null THEN [1] ELSE [] END |
            MERGE (un:Unit {{Name:units}})
            MERGE (c)-[r6:MEASURED_IN]->(un)
        )
        FOREACH(ignoreMe IN CASE WHEN biospecimen IS NOT null THEN [1] ELSE [] END |
            MERGE (bs:BioSpecimen {{Name:biospecimen}})
            MERGE (c)-[r8:FOUND_IN]->(bs)
        )

        SET r5.PubMed_ID = ""
        SET r5.PubMed_ID = pubmed_id + "," + r5.PubMed_ID

        WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
             pubmed_id, reference_text, c

        FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
            MERGE (p:Publication {{ Title: ref_title }})
            SET p.Authors = split(reference_text, ":")[0]
            SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
            SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
            SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
            SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
            SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
            SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
            SET p.PubMed_ID = pubmed_id
            MERGE (c)-[r2:CITED_IN]->(p)
        )
        """)
def add_taxonomy(filename):
    """
    Builds the CYPHER statement that imports "Taxonomy" nodes based on XML files obtained from the HMDB website.
    These represent the "kind" of metabolite we are dealing with (Family, etc).
    The query also sets the ``Description`` property of the Metabolite from the taxonomy description.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: It only creates relationships in the Kingdom -> Super Class -> Class -> Subclass
        direction, and from the Metabolite -> the most specific clade present (plus its direct parent
        and alternative parents). This means that, if any member of the
        Kingdom -> Super Class -> Class -> Subclass is absent, the line will be broken; hopefully
        in that case a new metabolite will come in to rescue and settle the relation!

    .. WARNING:: The last step links every Taxonomy node that has no relationships at all to the
        current Metabolite; if any disconnected taxonomy is left from before, this could lead to errors.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "taxonomy"] AS taxonomy

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH taxonomy, m
        UNWIND taxonomy as my_nodes
        WITH
            [X IN my_nodes._children WHERE X._type = "description"][0]._text AS description,
            [X IN my_nodes._children WHERE X._type = "direct_parent"][0]._text AS direct_parent,
            [X IN my_nodes._children WHERE X._type = "kingdom"][0]._text AS kingdom,
            [X IN my_nodes._children WHERE X._type = "super_class"][0]._text AS super_class,
            [X IN my_nodes._children WHERE X._type = "class"][0]._text AS class,
            [X IN my_nodes._children WHERE X._type = "sub_class"][0]._text AS sub_class,
            [X IN my_nodes._children WHERE X._type = "alternative_parents"] AS alternative_parents,
            [X IN my_nodes._children WHERE X._type = "substituents"] AS substituents,
            [X IN my_nodes._children WHERE X._type = "external_descriptors"] AS external_descriptors,
            m

        SET m.Description = apoc.text.capitalize(description)

        // First, we create the Taxonomy nodes independently
        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null THEN [1] ELSE [] END |
            MERGE (k:Taxonomy {{Type:"Kingdom", Name:kingdom}})
        )
        FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (sp:Taxonomy {{Type:"Super Class", Name:super_class}})
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{Type:"Class", Name:class}})
        )
        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END |
            MERGE (sb:Taxonomy {{Type:"Sub Class", Name:sub_class}})
        )
        FOREACH(ignoreMe IN CASE WHEN direct_parent IS NOT null THEN [1] ELSE [] END |
            MERGE (dp:Taxonomy {{Name:direct_parent}})
            MERGE (m)-[:PART_OF_CLADE]->(dp)
        )

        // Then, we add a hierarchy connecting the nodes as much as possible between them
        // (always pointing from the broader clade to the narrower one)
        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (k:Taxonomy {{ Type:"Kingdom", Name:kingdom }})
            MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }})
            MERGE (k)-[:PART_OF_CLADE]->(sp)
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{ Type:"Class", Name:class }})
            MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }})
            MERGE (sp)-[:PART_OF_CLADE]->(c)
        )
        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null AND class IS NOT null THEN [1] ELSE [] END |
            MERGE (c:Taxonomy {{ Type:"Class", Name:class }})
            MERGE (sb:Taxonomy {{ Type:"Sub Class", Name:sub_class }})
            MERGE (c)-[:PART_OF_CLADE]->(sb)
        )

        // And we connect the hierarchy to the main node just once
        FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:sub_class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN class IS NOT null
                                 AND sub_class IS null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null
                                 AND sub_class IS null AND class IS null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:super_class }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )
        FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND sub_class IS null
                                 AND class IS null AND super_class IS null THEN [1] ELSE [] END |
            MERGE (ta:Taxonomy {{ Name:kingdom }})
            MERGE (m)-[:PART_OF_CLADE]->(ta)
        )

        // We add the alternative_parents in the appropriate format
        FOREACH(element in alternative_parents|
            FOREACH(taxonomy in element._children|
                MERGE (t:Taxonomy {{Name:taxonomy._text}})
                MERGE (m)-[:PART_OF_CLADE]->(t)
            )
        )

        // If any Taxonomy is left without a connection, we connect it to the main graph
        // Beware: if any disconnected taxonomy is left from before, this could lead to errors
        WITH m, alternative_parents
        MATCH (tt:Taxonomy) WHERE NOT (tt)--()
        MERGE (m)-[:PART_OF_CLADE]->(tt)
        """)
def add_experimental_properties(filename):
    """
    Builds the CYPHER statement that adds properties to "Metabolite" nodes based on XML files
    obtained from the HMDB website.
    In this case, only properties labeled as <experimental_properties> are added: the (kind, value)
    pairs of each metabolite are collected in a map, from which Water_Solubility, logP,
    Melting_Point and Boiling_Point are set.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Another option would have been to auto-add all the properties, and name them using
        RETURN "Experimental " + apoc.text.capitalizeAll(replace(kind, "_", " ")), value; however, this
        way we can select and not duplicate / overwrite values.

    .. TODO:: It would be nice to be able to distinguish between experimental and predicted properties
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "experimental_properties"] AS experimental_properties

        UNWIND experimental_properties as experimental_property
        WITH experimental_property, accession
        UNWIND experimental_property["_children"] AS my_property
        WITH my_property, accession
        WITH
            [X in my_property._children WHERE X._type = "kind"][0]._text AS kind,
            [X in my_property._children WHERE X._type = "value"][0]._text AS value,
            accession

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        // One map per metabolite: property kind -> property value
        WITH apoc.map.fromLists(collect(kind), collect(value)) AS dict, m
        SET m.Water_Solubility = dict["water_solubility"], m.logP = dict["logp"],
            m.Melting_Point = dict["melting_point"], m.Boiling_Point = dict["boiling_point"]
        """)
def add_predicted_properties(filename):
    """
    Builds the CYPHER statement that adds properties to "Metabolite" nodes based on XML files
    obtained from the HMDB website.
    In this case, only properties labeled as <predicted_properties> are added: the (kind, value)
    pairs of each metabolite are collected in a map, from which a fixed selection of
    properties is set on the node.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Another option would have been to auto-add all the properties, and name them using
        RETURN "Predicted " + apoc.text.capitalizeAll(replace(kind, "_", " ")), value; however, this
        way we can select and not duplicate / overwrite values.

    .. NOTE:: The "veber_rule" value is stored in a property spelled ``Verber_Rule`` (sic); the
        name is kept as-is so that existing databases and queries remain compatible.

    .. TODO:: It would be nice to be able to distinguish between experimental and predicted properties
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "predicted_properties"] AS predicted_properties

        UNWIND predicted_properties as predicted_property
        WITH predicted_property, accession
        UNWIND predicted_property["_children"] AS my_property
        WITH my_property, accession
        WITH
            [X in my_property._children WHERE X._type = "kind"][0]._text AS kind,
            [X in my_property._children WHERE X._type = "value"][0]._text AS value,
            accession

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        // One map per metabolite: property kind -> property value
        WITH apoc.map.fromLists(collect(kind), collect(value)) AS dict, m
        SET m.Bioavailability = dict["bioavailability"],
            m.Donor_Count = dict["donor_count"], m.Polar_Surface_Area = dict["polar_surface_area"],
            m.Ro5 = dict["rule_of_five"], m.pKa_Strongest_Acidic = dict["pka_strongest_acidic"],
            m.pKa_Strongest_Basic = dict["pka_strongest_basic"], m.Number_of_Rings = dict["number_of_rings"],
            m.Physiological_Charge = dict["physiological_charge"], m.Polarizability = dict["polarizability"],
            m.logS = dict["logs"], m.MDDR_Like_Rule = dict["mddr_like_rule"],
            m.Ghose_Filter = dict["ghose_filter"], m.Refractivity = dict["refractivity"],
            m.Rotatable_Bond_Count = dict["rotatable_bond_count"], m.Acceptor_Count = dict["acceptor_count"],
            m.Formal_Charge = dict["formal_charge"], m.Verber_Rule = dict["veber_rule"]
        """)
def add_biological_properties(filename):
    """
    Builds the CYPHER statement that imports the biological properties of "Metabolite" nodes
    based on XML files obtained from the HMDB website.
    In this case, only the contents of <biological_properties> are used: they are turned into
    "CelularLocation", "BioSpecimen", "Tissue" and "Pathway" nodes, each related to the Metabolite.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Pathway nodes are MERGEd by Name; their SMPDB_ID and KEGG_ID properties are
        (re)set from the file each time the pathway is found.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "biological_properties"] AS biological_properties

        MERGE (m:Metabolite {{HMDB_ID:accession}})

        WITH biological_properties, m
        UNWIND biological_properties AS biological_property
        WITH biological_property, m
        UNWIND biological_property["_children"] AS my_property
        WITH
            [X IN my_property._children WHERE X._type = "cellular"] AS cellulars,
            [X IN my_property._children WHERE X._type = "biospecimen"] AS biospecimens,
            [X IN my_property._children WHERE X._type = "tissue"] AS tissues,
            [X IN my_property._children WHERE X._type = "pathway"] AS pathways,
            m

        FOREACH(location IN cellulars|
            MERGE (c:CelularLocation {{Name:location._text}})
            MERGE (m)-[r:LOCATED_INSIDE_CELL]->(c)
        )
        FOREACH(location IN biospecimens|
            MERGE (b:BioSpecimen {{Name:location._text}})
            MERGE (m)-[r:LOCATED_IN_BIOSPECIMEN]->(b)
        )
        FOREACH(location IN tissues|
            MERGE (t:Tissue {{Name:location._text}})
            MERGE (m)-[r:LOCATED_IN_TISSUE]->(t)
        )

        WITH pathways, m
        UNWIND pathways as my_pathways
        WITH my_pathways, m
        WITH
            [X in my_pathways._children WHERE X._type = "name"][0]._text AS name,
            [X in my_pathways._children WHERE X._type = "smpdb_id"][0]._text AS smpdb_id,
            [X in my_pathways._children WHERE X._type = "kegg_map_id"][0]._text AS kegg_map_id,
            m

        MERGE (p:Pathway {{Name:name}})
        SET p.SMPDB_ID = smpdb_id, p.KEGG_ID = kegg_map_id
        MERGE (m)-[r:PART_OF_PATHWAY]->(p)
        """)
def add_proteins(filename):
    """
    Builds the CYPHER statement that imports "Protein" nodes based on XML files obtained from the HMDB website.
    Each ``<protein>`` element is MERGEd by its accession (``HMDB_ID``), its scalar fields are stored
    as node properties, and its subcellular locations become "CelularLocation" nodes.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: We are not creating "Gene" nodes (even though each protein comes from a given gene)
        because we believe not enough information is being given about them.

    .. NOTE:: ``Synonyms``, ``Secondary_HMDB_IDs`` and ``PDB_ID`` are reset to an empty string and then
        rebuilt by prepending each value followed by a comma, so they end up as comma-terminated lists.

    .. NOTE:: The protein type is stored in ``Protein_Type``; ``Function`` holds the general function.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "name"][0]._text AS name,
            [X in metabolite._children WHERE X._type = "protein_type"][0]._text AS protein_type,
            [X in metabolite._children WHERE X._type = "gene_name"][0]._text AS gene_name,
            [X in metabolite._children WHERE X._type = "general_function"][0]._text AS general_function,
            [X in metabolite._children WHERE X._type = "specific_function"][0]._text AS specific_function,
            [X in metabolite._children WHERE X._type = "genbank_protein_id"][0]._text AS genbank_protein_id,
            [X in metabolite._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id,
            [X in metabolite._children WHERE X._type = "uniprot_name"][0]._text AS uniprot_name,
            [X in metabolite._children WHERE X._type = "genbank_gene_id"][0]._text AS genbank_gene_id,
            [X in metabolite._children WHERE X._type = "genecard_id"][0]._text AS genecard_id,
            [X in metabolite._children WHERE X._type = "geneatlas_id"][0]._text AS geneatlas_id,
            [X in metabolite._children WHERE X._type = "hgnc_id"][0]._text AS hgnc_id,
            [X in metabolite._children WHERE X._type = "subcellular_locations"] AS subcellular_locations,
            [X in metabolite._children WHERE X._type = "secondary_accessions"] AS secondary_accessions,
            [X in metabolite._children WHERE X._type = "pdb_ids"] AS pdb_ids,
            [X in metabolite._children WHERE X._type = "synonyms"] AS synonyms

        MERGE (p:Protein {{ HMDB_ID:accession }} )

        // protein_type gets its own property: assigning it to p.Function as well would be
        // immediately overwritten by general_function within this same SET
        SET p.Name = name, p.UniProt_ID = uniprot_id,
            p.Protein_Type = protein_type, p.Gene_Name = gene_name, p.Function = general_function,
            p.Specific_Function = specific_function, p.Genbank_Protein_ID = genbank_protein_id,
            p.GenBank_Gene_ID = genbank_gene_id, p.GeneCards_ID = genecard_id,
            p.GenAtlas_ID = geneatlas_id, p.HGNC_ID = hgnc_id

        WITH secondary_accessions, synonyms, pdb_ids, subcellular_locations, p

        FOREACH(element in subcellular_locations|
            FOREACH(location in element._children|
                MERGE (c:CelularLocation {{Name:location._text}})
                MERGE (p)-[r:LOCATED_INSIDE_CELL]->(c)
            )
        )

        SET p.Synonyms = "", p.Secondary_HMDB_IDs = "", p.PDB_ID = ""

        FOREACH(element in secondary_accessions|
            FOREACH(accession in element._children|
                SET p.Secondary_HMDB_IDs = accession._text + "," + p.Secondary_HMDB_IDs
            )
        )
        FOREACH(element in synonyms|
            FOREACH(synonym in element._children|
                SET p.Synonyms = synonym._text + "," + p.Synonyms
            )
        )
        FOREACH(element in pdb_ids|
            FOREACH(pdb in element._children|
                SET p.PDB_ID = pdb._text + "," + p.PDB_ID
            )
        )
        """)
def add_go_classifications(filename):
    """
    Builds the CYPHER statement that imports "GeneOntology" nodes based on XML files obtained from the HMDB website.
    This relates each protein to some GO-Terms: every entry under ``<go_classifications>`` is MERGEd
    by its description, annotated with its GO_ID and Category, and related to the Protein.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "go_classifications"] AS go_classifications

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH go_classifications, p
        UNWIND go_classifications AS go_class
        WITH go_class, p
        UNWIND go_class["_children"] AS my_class
        WITH
            [X in my_class._children WHERE X._type = "category"][0]._text AS category,
            [X in my_class._children WHERE X._type = "description"][0]._text AS description,
            [X in my_class._children WHERE X._type = "go_id"][0]._text AS go_id,
            p

        MERGE (g:GeneOntology {{ Description:description }})
        SET g.GO_ID = go_id, g.Category = category
        MERGE (p)-[r:PART_OF_GENE_ONTOLOGY]-(g)
        """)
def add_gene_properties(filename):
    """
    Builds the CYPHER statement that imports the gene information of "Protein" nodes
    based on XML files obtained from the HMDB website.
    In this case, the data relates to the gene from which the protein originates: its sequence is
    stored in a "Sequence" node of Type "DNA" (with the chromosome location and locus as properties)
    which is related to the Protein.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: We are not creating "Gene" nodes (even though each protein comes from a given gene)
        because we believe not enough information is being given about them.

    .. NOTE:: The sequence is cleaned by removing the text up to (and including) the first "bp"
        found in ``gene_sequence``, as well as all spaces; empty results create no node.
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "gene_properties"] AS gene_properties

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH gene_properties, p
        UNWIND gene_properties AS gene_property
        WITH
            [X in gene_property._children WHERE X._type = "chromosome_location"][0]._text AS chromosome_location,
            [X in gene_property._children WHERE X._type = "locus"][0]._text AS locus,
            [X in gene_property._children WHERE X._type = "gene_sequence"][0]._text AS gene_sequence,
            p

        WITH
            replace(replace(gene_sequence, split(gene_sequence, "bp")[0]+"bp", ""), " ", "") as SEQUENCE,
            chromosome_location, locus, gene_sequence, p

        FOREACH(ignoreMe IN CASE WHEN SEQUENCE IS NOT null AND SEQUENCE <> "" THEN [1] ELSE [] END |
            MERGE (se:Sequence {{ Sequence:SEQUENCE }} )
            SET se.Type= "DNA", se.Chromosome_Location = chromosome_location, se.Locus = locus
            MERGE (p)-[r:SEQUENCED_AS]->(se)
        )
        """)
def add_protein_properties(filename):
    """
    Builds the CYPHER statement that adds some properties to "Protein" nodes based on XML files
    obtained from the HMDB website.
    In this case, properties mostly relate to the protein itself: Residue_Number, Molecular_Weight and
    Theoretical_PI are set on the node, the polypeptide sequence is stored in a "Sequence" node of
    Type "PROT", and each PFam entry becomes a "PFam" node related to the Protein.

    Args:
        filename (str): The name of the XML file that is being imported. It is inserted verbatim
            into the ``file:///`` URL passed to ``apoc.load.xml``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: The "signal_regions" and the "transmembrane_regions" properties were left out
        because, after a preliminary search, they were mostly empty
    """
    return (f"""
        CALL apoc.load.xml("file:///{filename}")
        YIELD value
        WITH [x in value._children WHERE x._type = "protein"] AS metabolites
        UNWIND metabolites AS metabolite
        WITH
            [X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
            [X in metabolite._children WHERE X._type = "protein_properties"] AS protein_properties,
            [X in metabolite._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id

        MERGE (p:Protein {{ HMDB_ID:accession }})

        WITH protein_properties, p, uniprot_id
        UNWIND protein_properties AS protein_property
        WITH
            [X in protein_property._children WHERE X._type = "residue_number"][0]._text AS residue_number,
            [X in protein_property._children WHERE X._type = "molecular_weight"][0]._text AS molecular_weight,
            [X in protein_property._children WHERE X._type = "theoretical_pi"][0]._text AS theoretical_pi,
            [X in protein_property._children WHERE X._type = "pfams"] AS pfams,
            [X in protein_property._children WHERE X._type = "polypeptide_sequence"][0]._text AS polypeptide_sequence,
            p, uniprot_id

        SET p.Residue_Number = residue_number, p.Molecular_Weight = molecular_weight,
            p.Theoretical_PI = theoretical_pi

        FOREACH(ignoreMe IN CASE WHEN polypeptide_sequence IS NOT null AND polypeptide_sequence <> "" THEN [1] ELSE [] END |
            MERGE (se:Sequence {{ Sequence: polypeptide_sequence }} )
            SET se.Type= "PROT", se.UniProt_ID = uniprot_id
            MERGE (p)-[r:SEQUENCED_AS]->(se)
        )

        WITH p, pfams
        UNWIND pfams AS pfam
        WITH p, pfam
        UNWIND pfam["_children"] AS my_pfam
        WITH
            [X in my_pfam._children WHERE X._type = "name"][0]._text AS name,
            [X in my_pfam._children WHERE X._type = "pfam_id"][0]._text AS pfam_id,
            p

        MERGE (pf:PFam {{ PFAM_ID:pfam_id }})
        SET pf.Name = name
        MERGE (p)-[r:PART_OF_PFAM]->(pf)
        """)
def add_general_references(filename, type_of):
    """
    Builds the CYPHER query that creates "Publication" nodes from the ``general_references``
    section of an XML file obtained from the HMDB website.

    For every top-level element of type ``type_of``, the query MATCHes the existing Metabolite or
    Protein node whose HMDB_ID equals the element's accession, splits each ``reference_text`` into
    title, authors, publication, notes, date, volume, issue and pages, MERGEs a Publication node on
    the parsed title and links it to the matched node through a CITED_IN relationship.
    References whose parsed title is null are skipped.

    Args:
        filename (str): The name of the XML file that is being imported; it is inserted verbatim after ``file:///``
        type_of (str): The ``_type`` of the top-level XML elements to process (for instance, "protein")

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: Since not all nodes present a "PubMed_ID" field (which would be ideal to uniquely-identify
        Publications), nodes are MERGEd on the "Title" parsed out of the free-text reference. That text is
        prone to typos/errors, so some duplicates might exist, which should be accounted for.

    .. NOTE:: Unlike the rest, here we are not matching metabolites, but ALSO proteins. This is intentional.
    """
    # The two clauses that depend on the arguments are the only f-strings
    source_clauses = (
        f'\nCALL apoc.load.xml("file:///{filename}")\n'
        "YIELD value\n"
        f'WITH [x in value._children WHERE x._type = "{type_of}"] AS metabolites\n'
    )

    # The remaining clauses are constant; as a plain literal, CYPHER map braces need no doubling
    citation_clauses = """UNWIND metabolites AS metabolite
WITH
[X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
[X in metabolite._children WHERE X._type = "general_references"] AS general_references
MATCH (m) WHERE (m:Metabolite OR m:Protein) AND m.HMDB_ID = accession
WITH general_references, m
UNWIND general_references AS general_reference
WITH general_reference, m
UNWIND general_reference["_children"] AS my_reference
WITH
[X in my_reference._children WHERE X._type = "reference_text"][0]._text AS reference_text,
[X in my_reference._children WHERE X._type = "pubmed_id"][0]._text AS pubmed_id,
m
WITH split(replace(reference_text, split(reference_text, ":")[0]+": ", ""), ".")[0] AS ref_title,
pubmed_id, reference_text, m
FOREACH(ignoreMe IN CASE WHEN ref_title IS NOT null THEN [1] ELSE [] END |
MERGE (p:Publication { Title: ref_title })
SET p.Authors = split(reference_text, ":")[0]
SET p.Publication = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[0]
SET p.Notes = split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[2]
SET p.Date = split(split(replace(reference_text, split(reference_text, ".")[0]+". ",""), ".")[1],";")[0]
SET p.Volume = split(split(reference_text, ";")[1], "(")[0]
SET p.Issue = split(split(reference_text, "(")[1], ")")[0]
SET p.Pages = split(split(reference_text, ":")[-1], ".")[0]
SET p.PubMed_ID = pubmed_id
MERGE (m)-[r:CITED_IN]->(p)
)
"""
    return source_clauses + citation_clauses
def add_protein_associations(filename):
    """
    Builds the CYPHER query that creates "Protein" nodes from the ``protein_associations``
    section of a "Metabolite" XML file obtained from the HMDB website.

    For every ``metabolite`` element, the query MERGEs a Metabolite node on its HMDB accession, MERGEs
    one Protein node per associated protein on its ``protein_accession``, and MERGEs an
    INTERACTS_WITH relationship between both (the pattern is written without a direction).
    Name, gene name, protein type (stored as "Function") and UniProt ID are set ON CREATE only,
    so a Protein node that already existed keeps whatever values it had for those properties.

    Args:
        filename (str): The name of the XML file that is being imported; it is inserted verbatim after ``file:///``

    Returns:
        str: The text of the CYPHER statement. This function only builds the query; it does not run it.

    .. NOTE:: This creates Proteins based on info on the "Metabolite" files, not on the "Protein" files
        themselves. This could mean node duplication but, hopefully, the MERGE on the HMDB accession
        will mean that these duplicates are caught.
    """
    # Only the LOAD clause depends on the argument, so it is the only f-string
    load_clause = f'\nCALL apoc.load.xml("file:///{filename}")\n'

    # The remaining clauses are constant; as a plain literal, CYPHER map braces need no doubling
    association_clauses = """YIELD value
WITH [x in value._children WHERE x._type = "metabolite"] AS metabolites
UNWIND metabolites AS metabolite
WITH
[X in metabolite._children WHERE X._type = "accession"][0]._text AS accession,
[X in metabolite._children WHERE X._type = "protein_associations"] AS protein_associations
MERGE (m:Metabolite {HMDB_ID:accession})
WITH protein_associations, m
UNWIND protein_associations AS protein_association
WITH protein_association, m
UNWIND protein_association["_children"] AS my_protein
WITH
[X in my_protein._children WHERE X._type = "protein_accession"][0]._text AS protein_accession,
[X in my_protein._children WHERE X._type = "name"][0]._text AS name,
[X in my_protein._children WHERE X._type = "uniprot_id"][0]._text AS uniprot_id,
[X in my_protein._children WHERE X._type = "gene_name"][0]._text AS gene_name,
[X in my_protein._children WHERE X._type = "protein_type"][0]._text AS protein_type,
m
MERGE (p:Protein { HMDB_ID:protein_accession })
ON CREATE SET p.Gene_Name = gene_name, p.Function = protein_type,
p.UniProt_ID = uniprot_id, p.Name = name
MERGE (m)-[r:INTERACTS_WITH]-(p)
"""
    return load_clause + association_clauses
def build_from_protein_file(newfile, driver):
    """
    Builds a portion of the HMDB database in graph format from one "Protein" XML file.

    These files are downloaded separately from the website, as ```hmdb_proteins.zip```, and can be supplied
    either as the full file or as a splitted version of it, with just one item per file
    (which is recommended due to memory limitations).

    Each CYPHER query builder is called on ``newfile`` and the resulting query is handed, together
    with ``driver``, to ``misc.manage_transaction``; this happens one builder at a time, in the order listed below.

    Args:
        newfile (str): The path of the XML file to import
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.
    """
    # Builders that only need the file name, in the order they must run
    # NOTE(review): presumably add_proteins goes first so that later queries find their Protein nodes; confirm
    file_only_builders = (
        add_proteins,
        add_go_classifications,
        add_gene_properties,
        add_protein_properties,
        add_metabolite_associations,
        add_metabolite_references,
    )
    for build_query in file_only_builders:
        misc.manage_transaction(build_query(newfile), driver)

    # General references run last and must be told which top-level element type to read
    misc.manage_transaction(add_general_references(newfile, "protein"), driver)