#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT
"""
A python module that provides the necessary functions to transition the MetaNetX database
(and related MeSH terms and KEGG IDs) to graph format, either from scratch importing all
the nodes (as showcased in :obj:`CanGraph.MeSHandMetaNetX.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
.. NOTE:: You may notice some functions here present the ``**kwargs`` arguments option.
This is in order to make the functions compatible with the
:obj:`CanGraph.miscelaneous.manage_transaction` function, which might send back a variable
number of arguments (although technically it could work without the ``**kwargs`` option)
"""
import json # Read JSON files from Python
import urllib.request as request # Extensible library for opening URLs
import urllib.parse # Parse URLs in Python
import re # Regular expression search
import time # Manage the time, and wait times, in python
import os, sys, shutil # Vital modules to interact with the filesystem
# Import subscripts for the program
# This hack allows us to de-duplicate the miscelaneous script in this less-used script
sys.path.append("../")
# .. NOTE:: Please beware that, if using this module by itself, you might need to copy "miscelaneous.py" into your path
# This is not the most elegant, but it simplifies code maintenance, and this script shouldn't be used much so...
import miscelaneous as misc
# ********* SPARQL queries to annotate existing nodes using MeSH ********* #
def add_mesh_by_name():
    """
    A function that adds some MeSH nodes to any existing nodes, based on their Name property.
    Only currently active MeSH_IDs are parsed.

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: Only exact matches work here, which is not ideal.

    .. NOTE:: Be careful when writing CYPHER commands for the driver: sometimes, \" != \' !!!

    .. versionchanged:: 1.0
        Reverted the filtering to old version in order to make the search more specific
    """
    # The CYPHER below builds one SPARQL query per named node and posts it to the
    # NLM MeSH endpoint via apoc.load.jsonParams; each Descriptor/Concept hit is
    # MERGEd as a :MeSH node linked to the source node with :RELATED_MESH
    return """
        CALL {
            MATCH (n)
            WHERE (n:Metabolite OR n:Protein OR n:Drug OR n:Disease OR n:Pathway OR n:Tissue
                OR n:CelularLocation OR n:BioSpecimen OR n:Gene OR n:Cause OR n:AdministrationRoute)
                AND n.Name IS NOT null AND n.Name <> ""
            WITH '
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
                PREFIX owl: <http://www.w3.org/2002/07/owl#>
                PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
                PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
                PREFIX mesh2022: <http://id.nlm.nih.gov/mesh/2022/>
                PREFIX mesh2021: <http://id.nlm.nih.gov/mesh/2021/>
                PREFIX mesh2020: <http://id.nlm.nih.gov/mesh/2020/>
                SELECT DISTINCT *
                FROM <http://id.nlm.nih.gov/mesh>
                WHERE
                { ?MeSH_Descriptor_ID
                    a meshv:Descriptor ;
                    meshv:concept ?MeSH_Concept_ID ;
                    meshv:active 1 ;
                    rdfs:label ?MeSH_Descriptor_Name .
                ?MeSH_Concept_ID
                    meshv:active 1 ;
                    rdfs:label ?MeSH_Concept_Name
                FILTER ( ( ?MeSH_Descriptor_Name = \"' + replace(n.Name, " ", " ") + '\"@en ) ||
                         ( ?MeSH_Concept_Name = \"' + replace(n.Name, " ", " ") + '\"@en ) )
                }' AS sparql, n
            CALL apoc.load.jsonParams(
                replace("https://id.nlm.nih.gov/mesh/sparql?query="
                    + apoc.text.urlencode(sparql) + "&format=JSON&limit=50&offset=0&inference=true", "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
            YIELD value
            UNWIND value['results']['bindings'] as row
            MERGE (c:MeSH { MeSH_ID:split(row['MeSH_Descriptor_ID']['value'],'/')[-1] , Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']})
            MERGE (n)-[r:RELATED_MESH]->(c)
            MERGE (cc:MeSH { MeSH_ID:split(row['MeSH_Concept_ID']['value'],'/')[-1] , Type:"Concept", Name: row['MeSH_Concept_Name']['value']})
            MERGE (n)-[r2:RELATED_MESH]->(cc)
        } IN TRANSACTIONS OF 10 rows
        """
# ********* SPARQL queries to annotate existing nodes using MetaNetX ********* #
def add_prefixes():
    """
    Add some prefixes necessary for all MetaNetX queries to work.
    These are kept together since adding extra prefixes does not increase computation time.

    Returns:
        str: A text chain that represents the SPARQL prefix block, intended to be embedded
            inside the CYPHER queries built by the other functions in this module
    """
    return ("""
        PREFIX mnx: <https://rdf.metanetx.org/schema/>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX up: <http://purl.uniprot.org/uniprot/>
        PREFIX chebi: <http://purl.obolibrary.org/obo/CHEBI_>
        PREFIX hmdb: <https://identifiers.org/hmdb:>
        PREFIX keggC: <https://identifiers.org/kegg.compound:>
        PREFIX keggR: <https://identifiers.org/kegg.reaction:>
        """)
def get_identifiers(from_sparql=False, **kwargs):
    """
    Part of a CYPHER query that processes the outcome from a SPARQL query that searches for
    information on MetaNetX.

    It takes an original metabolite (n) and a row variable, which should have columns named
    external_identifier, cross_refference, InChIKey, InChI, SMILES, Formula and Mass with the
    adequate format; it is basically a code-reuser, not intended to be used separately.

    Args:
        from_sparql (bool): A True/False param defining whether the identifiers are being parsed
            from a SPARQL query; default is False (i.e. imported from file)
        **kwargs: Any number of arbitrary keyword arguments
            (kept for compatibility with :obj:`CanGraph.miscelaneous.manage_transaction`)

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: All HMDB matches might create a Metabolite without CHEBI_ID or CAS_Number, which
        would violate our schema. This will be later on accounted for.

    .. NOTE:: Some keys, such as VMH_ID, are not merged into their own node, but rather added to
        an existing one. This is because these did not previously exist in our Schema, and might
        be changed in the future.

    .. NOTE:: We don't care about overwriting InChI and InChIKey because they are necessarily
        unique; the same is true for Mass and Formula, as they are not all that important.
        However, for HMDB ID and others, we will take care not to overwrite, which could mess
        up the DB
    """
    if from_sparql:
        # Two extra CYPHER snippets are only needed when the data comes straight from SPARQL:
        # the first unpacks the SPARQL result row into plain variables and stores the basic
        # chemical properties on n; the second links HMDB synonyms and MetaNetX isomers
        sparql_parser = ["""
        WITH
            toLower(split(row["external_identifier"]["value"], ":")[0]) as databasename,
            split(row["external_identifier"]["value"], ":")[-1] as databaseid,
            row["cross_refference"]["value"] as url, row["InChI"]["value"] as InChI,
            row["InChIKey"]["value"] as InChIKey, row["SMILES"]["value"] as SMILES,
            row["Formula"]["value"] as Formula, row["Mass"]["value"] as Mass,
            split(row['mnx_url']['value'],'/')[-1] as MetaNetX_ID,
            split(row['Isomers']['value'],'/')[-1] as Isomer,
            n
        FOREACH(ignoreme in case when MetaNetX_ID IS NOT NULL
                AND MetaNetX_ID <> "" then [1] else [] end |
            SET n.MetaNetX_ID = MetaNetX_ID
        )
        FOREACH(ignoreme in case when InChIKey IS NOT NULL
                AND InChIKey <> "" then [1] else [] end |
            SET n.InChIKey = InChIKey
        )
        FOREACH(ignoreme in case when InChI IS NOT NULL
                AND InChI <> "" then [1] else [] end |
            SET n.InChI = InChI
        )
        FOREACH(ignoreme in case when Formula IS NOT NULL
                AND Formula <> "" then [1] else [] end |
            SET n.Formula = Formula
        )
        FOREACH(ignoreme in case when Mass IS NOT NULL
                AND Mass <> "" then [1] else [] end |
            SET n.Average_Mass = Mass
        )
        FOREACH(ignoreme in case when SMILES IS NOT NULL
                AND SMILES <> "" then [1] else [] end |
            SET n.SMILES = SMILES
        )
        """,
        """
        WITH n, databasename, databaseid, MetaNetX_ID, Isomer
        MATCH (m) WHERE (m:Metabolite OR m:Protein)
        FOREACH(ignoreme in case when databasename = "hmdb" then [1] else [] end |
            FOREACH(ignoreme in case when databaseid IN n.Secondary_HMDB_IDs then [1] else [] end |
                MERGE (n)-[r:SYNONYM_OF]-(m)
                FOREACH(ignoreme in case when MetaNetX_ID IS NOT NULL AND MetaNetX_ID <> "" then [1] else [] end |
                    SET m.MetaNetX_ID = MetaNetX_ID
                )
            )
        )
        FOREACH(ignoreme in case when Isomer IS NOT NULL AND Isomer <> "" then [1] else [] end |
            MERGE (z:Metabolite { MetaNetX_ID:Isomer })
            MERGE (z)-[r:ISOMER_OF]-(n)
        )
        """]
    else:
        # When importing from file, the caller already provides databasename / databaseid / n,
        # so no extra pre/post-processing is interpolated
        sparql_parser = ["", ""]
    # The shared core: map each external database prefix onto the matching node property,
    # MERGEing synonym nodes for ChEBI / KEGG / HMDB identifiers
    return f"""
    {sparql_parser[0]}
    FOREACH(ignoreme in case when databasename = "chebi"
            AND n.ChEBI_ID <> databaseid then [1] else [] end |
        MERGE (m {{ChEBI_ID: databaseid}})
        MERGE (n)-[r:SYNONYM_OF]-(m)
        FOREACH(ignoreme in case when size(labels(m)) < 1 then [1] else [] end |
            SET m:Metabolite
        )
    )
    FOREACH(ignoreme in case when databasename = "kegg.compound"
            OR databasename = "keggc" AND n.KEGG_ID <> databaseid then [1] else [] end |
        MERGE (m {{KEGG_ID: split(databaseid, "M_")[-1]}})
        MERGE (n)-[r:SYNONYM_OF]-(m)
        FOREACH(ignoreme in case when size(labels(m)) < 1 then [1] else [] end |
            SET m:Metabolite
        )
    )
    FOREACH(ignoreme in case when databasename = "hmdb"
            AND n.HMDB_ID <> databaseid then [1] else [] end |
        MERGE (m {{HMDB_ID: split(databaseid, "M_")[-1]}})
        MERGE (n)-[r:SYNONYM_OF]-(m)
        FOREACH(ignoreme in case when size(labels(m)) < 1 then [1] else [] end |
            SET m:Metabolite
        )
    )
    FOREACH(ignoreme in case when databasename = "vmhmetabolite" then [1] else [] end |
        SET n.VMH_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "biggm"
            OR databasename = "bigg.metabolite" then [1] else [] end |
        SET n.BiGG_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "sabiork.compound"
            OR databasename = "sabiorkm" then [1] else [] end |
        SET n.SabioRK_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "seed.compound" then [1] else [] end |
        SET n.ModelSeed_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "seedm" then [1] else [] end |
        SET n.ModelSeed_ID = split(databaseid, "M_")[-1]
    )
    FOREACH(ignoreme in case when databasename = "metacyc.compound"
            OR databasename = "metacycm" then [1] else [] end |
        SET n.MetaCyc_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "slm" then [1] else [] end |
        SET n.SwissLipids_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "envipath"
            OR databasename = "envipathm" then [1] else [] end |
        SET n.EnviPath_ID = databaseid
    )
    {sparql_parser[1]}
    """
# ********* Build the entire MetaNetX DB as a graph under our format ********* #
def add_chem_xref(filename):
    """
    A CYPHER query that loads the `chem_xref.tsv` file available at the MetaNetX site,
    using a graph format.

    Args:
        filename (str): The name of the CSV file that is being imported

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: For performance, it is recommended to split the file in 1 subfile for each row in the DataBase
    """
    # The cross-reference column ("#source") is split into database name and id, and the
    # shared identifier-mapping logic from get_identifiers() is appended
    return f"""
    LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line FIELDTERMINATOR '\t'
    MERGE (n:Metabolite {{ MetaNetX_ID:line.ID }})
    SET n.Description = line.description
    WITH toLower(split(line["#source"], ":")[0]) as databasename,
         split(line["#source"], ":")[-1] as databaseid,
         n
    { get_identifiers(from_sparql = False) }
    """
def add_chem_prop(filename):
    """
    A CYPHER query that loads the `chem_prop.tsv` file available at the MetaNetX site,
    using a graph format.

    Args:
        filename (str): The name of the CSV file that is being imported

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: For performance, it is recommended to split the file in 1 subfile for each row in the DataBase
    """
    return f"""
    LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line FIELDTERMINATOR '\t'
    MERGE (n:Metabolite {{ MetaNetX_ID:line["#ID"] }})
    SET n.Name = line.name, n.Formula = line.Formula, n.Charge = line.charge, n.Average_Mass = line.mass,
        n.InChI = line.InChI, n.InChIKey = line.InChIKey, n.SMILES = line.SMILES
    """
def add_chem_isom(filename):
    """
    A CYPHER query that loads the `chem_isom.tsv` file available at the MetaNetX site,
    using a graph format.

    Args:
        filename (str): The name of the CSV file that is being imported

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: For performance, it is recommended to split the file in 1 subfile for each row in the DataBase
    """
    # Links the "#parent" and "child" metabolites; the relationship is between n (parent)
    # and m (child) — the original `(m)-[r:ISOMER_OF]-(m)` self-relation left n unlinked
    return f"""
    LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line FIELDTERMINATOR '\t'
    MERGE (n:Metabolite {{ MetaNetX_ID:line["#parent"] }})
    MERGE (m:Metabolite {{ MetaNetX_ID:line["child"] }})
    MERGE (n)-[r:ISOMER_OF]-(m)
    SET n.Alternative_names = split(line.description," -> ")[0] + "," + n.Alternative_names
    SET m.Alternative_names = split(line.description," -> ")[1] + "," + m.Alternative_names
    """
def add_comp_xref(filename):
    """
    A CYPHER query that loads the `comp_xref.tsv` file available at the MetaNetX site,
    using a graph format.

    Args:
        filename (str): The name of the CSV file that is being imported

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: For performance, it is recommended to split the file in 1 subfile for each row in the DataBase

    .. NOTE:: Some identifiers present the CL/cl prefix. Since I could not find what this prefix refers to,
        and since it only pertains to one single MetaNetX ID, we did not take them into account

    .. NOTE:: The "description" field in the DataBase is ignored, since it seems to be quite similar, but less useful,
        than the "name" field from comp_prop, which is more coherent with our pre-existing schema
    """
    # NOTE(review): the "cl" branch below writes to n.ModelSeed_ID, same as "seedc" — this
    # contradicts the docstring note saying CL prefixes are ignored; confirm intended property
    return f"""
    LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line FIELDTERMINATOR '\t'
    MERGE (n:CelularLocation {{ MetaNetX_ID:line.ID }})
    WITH toLower(split(line["#source"], ":")[0]) as databasename,
         split(line["#source"], ":")[-1] as databaseid,
         n
    FOREACH(ignoreme in case when databasename = "biggc" OR databasename = "bigg.compartment" then [1] else [] end |
        SET n.BiGG_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "go" then [1] else [] end |
        SET n.GO_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "seedc" then [1] else [] end |
        SET n.ModelSeed_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "cl" then [1] else [] end |
        SET n.ModelSeed_ID = databaseid
    )
    FOREACH(ignoreme in case when databasename = "cco" then [1] else [] end |
        SET n.Cell_Component_Ontology_ID = databaseid
    )
    """
def add_comp_prop(filename):
    """
    A CYPHER query that loads the `comp_prop.tsv` file available at the MetaNetX site,
    using a graph format.

    Args:
        filename (str): The name of the CSV file that is being imported

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: For performance, it is recommended to split the file in 1 subfile for each row in the DataBase
    """
    return f"""
    LOAD CSV WITH HEADERS FROM ('file:///{filename}') AS line FIELDTERMINATOR '\t'
    MERGE (n:CelularLocation {{ MetaNetX_ID:line["#ID"] }})
    SET n.Name = line.name
    """
def add_pept():
    """
    A CYPHER query that loads all the proteins available at the MetaNetX site, using a graph
    format and SPARQL.

    Returns:
        str: A text chain that represents the CYPHER query with the desired output.
            This can be run using: :obj:`neo4j.Session.run`

    .. NOTE:: SPARQL was only used here because, unlike with the other files, there is no download available;
        also, given there are few proteins, Neo4J is able to process it without running out of memory
        (unlike what happened with the other fields)

    .. NOTE:: This is an **autocommit transaction**. This means that, in order to not keep data in memory
        (and make running it with a huge amount of data) more efficient, you will need to add ```:auto ```
        when calling it from the Neo4J browser, or call it as ```session.run( clean_database() )``` from the driver.
    """
    # NOTE(review): unlike add_mesh_by_name, the sparql text is appended to the URL without
    # apoc.text.urlencode — confirm the MetaNetX endpoint tolerates the raw query string
    return f"""
    WITH '
    { add_prefixes() }
    SELECT DISTINCT *
    WHERE{{
        ?protein a mnx:PEPT ;
            mnx:peptXref ?cross_refference .
    }}' AS sparql
    CALL apoc.load.jsonParams(
        replace("https://rdf.metanetx.org/sparql/?query=" + sparql, "\n", ""),
        {{ Accept: "application/sparql-results+json" }}, null )
    YIELD value
    UNWIND value['results']['bindings'] as row
    CALL {{
        WITH row
        MERGE (p:Metabolite {{MetaNetX_ID:split(row['protein']['value'],'/')[-1]}})
        SET p:Protein, p.UniProt_ID = split(row['cross_refference']['value'],'/')[-1]
    }} IN TRANSACTIONS OF 100 rows
    """
# ********* Annotate existing nodes using KEGG Pathways and Component IDs ********* #
# ********* Translate a given identifier to another using The Chemical Translation Service ********* #
def find_synonyms_in_cts(fromIdentifier, toIdentifier, searchTerm):
    """
    Finds synonyms for a given metabolite in CTS, The Chemical Translation Service.

    Args:
        fromIdentifier (str): The name of the database from which we want the conversion
        toIdentifier (str): The name of the database to which we want the conversion
        searchTerm (str): The search term, which should be an ID of database: ``fromIdentifier``

    Returns:
        str: The requested synonym (an empty string if the server answered 500/504)

    Raises:
        Exception: If the URL keeps failing after 5 attempts

    .. NOTE:: Please, be sure to use a database name that is in compliance with those specified in
        CTS itself; if you don't, this function will fail with a 500 error

    .. NOTE:: To prevent random downtimes from crashing the function, any one URL will be tried at least 5 times
        before crashing (see: `StackOverflow #9446387 <https://stackoverflow.com/questions/9446387/how-to-retry-urllib2-request-when-fails>`_
    """
    src = urllib.parse.quote(fromIdentifier)
    dest = urllib.parse.quote(toIdentifier)
    query = urllib.parse.quote(searchTerm)
    url = f"https://cts.fiehnlab.ucdavis.edu/rest/convert/{src}/{dest}/{query}"
    for attempt in range(5):
        try:
            # Close the response deterministically instead of leaking the socket
            with request.urlopen(url) as results_file:
                data = json.load(results_file)
            return data[0]['results']
        except request.HTTPError as error:
            # For timeouts and internal server errors, just assume no result would be given
            # (weird that I'm finding these)
            if error.code in (500, 504):
                return ""
            error_code = error.code
        except request.URLError:
            # Plain URLError (DNS failure, refused connection, ...) carries no HTTP code
            error_code = "Undefined"
        if attempt < 4:
            print("An error with the URL has been detected. Retrying in 2 seconds...")
            time.sleep(2)
        else:
            raise Exception(f"An HTTP Error with Code: {error_code} was found. Aborting...")
# ********* Build from file ********* #
def build_from_file(filename, driver):
    """
    A function able to build a portion of the MetaNetX database in graph format, provided that
    one MetaNetX CSV is supplied to it.

    These CSVs are downloaded from the website, and can be presented either as the full file, or as a splitted
    version of it, with just one item per file (which is recommended due to memory limitations). If you want all
    the database to be imported, you should run this function with all the CSVs that form it, as portrayed in
    the :obj:`~CanGraph.MeSHandMetaNetX.main` module

    Args:
        filename (str): The name of the CSV file that is being imported
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.
    """
    # Map the file-name fragment that identifies each MetaNetX table onto its query builder;
    # only the first matching fragment is used, mirroring the original if/elif chain
    importers = {
        "chem_xref": add_chem_xref,
        "chem_prop": add_chem_prop,
        "chem_isom": add_chem_isom,
        "comp_xref": add_comp_xref,
        "comp_prop": add_comp_prop,
    }
    for fragment, build_query in importers.items():
        if fragment in filename:
            misc.manage_transaction(build_query(filename), driver)
            break