#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT
"""
A python module that provides the necessary functions to transition selected parts of the Wikidata database to graph format,
either from scratch importing all the nodes (as showcased in :obj:`CanGraph.QueryWikidata.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
.. NOTE:: You may notice some functions here present the ``**kwargs`` arguments option.
This is in order to make the functions compatible with the
:obj:`CanGraph.miscelaneous.manage_transaction` function, which might send back a variable
number of arguments (although technically it could work without the ``**kwargs`` option)
"""
[docs]def initial_cancer_discovery():
"""
A Neo4J Cypher Statment that queries wikidata for Human Cancers. Since using the "afflicts:human"
tag didnt have much use here, I used a simple workaround: Query wikidata for all humans, and, among them,
find all of this for which their cause of death was a subclass of "Cancer" (Q12078). Unfortunaltely,
some of them were diagnosed "Cancer" (Q12078), which is too general, so I removed it.
Args:
tx (neo4j.Session): The session under which the driver is running
Returns:
neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.
"""
return ("""
WITH 'SELECT DISTINCT ?cause_of_death ?cause_of_death_name
WHERE {
?human wdt:P31 wd:Q5;
wdt:P509 ?cause_of_death.
?cause_of_death wdt:P279* wd:Q12078.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?cause_of_death rdfs:label ?cause_of_death_name. }
}' AS sparql
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{ Accept: "application/sparql-results+json" }, null )
YIELD value
UNWIND value['results']['bindings'] as row
WITH row['cause_of_death_name']['value'] as label,
row['cause_of_death']['value'] as url,
split(row['cause_of_death']['value'],'/')[-1] as id
CREATE (c:Disease)
SET c.Name = label,
c.URL = url,
c.WikiData_ID = id
WITH c
MATCH (n:Disease)
WHERE n.WikiData_ID = "Q12078"
DETACH DELETE n
""")
[docs]def find_subclass_of_disease():
"""
A Neo4J Cypher Statment that queries wikidata for subclasses of "Disease" nodes already
present on the Database. Since these are expected to only affect humans, this subclasses
should also, only affect humans
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
because there will always be a WikiData_ID, but it is useful in the rest of the cases
"""
return """
CALL {
MATCH (c:Disease)
WHERE c.WikiData_ID IS NOT NULL AND c.WikiData_ID <> ""
WITH 'SELECT DISTINCT ?cancer ?cancer_name
WHERE {
?cancer wdt:P279 wd:' + c.WikiData_ID + ' .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?cancer rdfs:label ?cancer_name. }
}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{ Accept: "application/sparql-results+json" }, null )
YIELD value
UNWIND value['results']['bindings'] as row
CREATE (s:Disease)
SET s.Name = row['cancer_name']['value'],
s.URL = row['cancer']['value'],
s.WikiData_ID = split(row['cancer']['value'],'/')[-1]
CREATE (s)-[:SUBCLASS_OF]->(c)
} IN TRANSACTIONS OF 10 rows
"""
[docs]def find_instance_of_disease():
"""
A Neo4J Cypher Statment that queries wikidata for instances of "Disease" nodes already
present on the Database. Since these are expected to only affect humans, this subclasses
should also, only affect humans
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
because there will always be a WikiData_ID, but it is useful in the rest of the cases
"""
return """
CALL {
MATCH (c:Disease)
WHERE c.WikiData_ID IS NOT NULL AND c.WikiData_ID <> ""
WITH 'SELECT DISTINCT ?cancer ?cancer_name
WHERE {
?cancer wdt:P31 wd:' + c.WikiData_ID + ' .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?cancer rdfs:label ?cancer_name. }
}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{ Accept: "application/sparql-results+json" }, null )
YIELD value
UNWIND value['results']['bindings'] as row
CREATE (i:Disease)
SET i.Name = row['cancer_name']['value'],
i.URL = row['cancer']['value'],
i.WikiData_ID = split(row['cancer']['value'],'/')[-1]
CREATE (i)-[:INSTANCE_OF]->(c)
} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_disease_info(number, **kwargs):
"""
Adds info to "Disease" nodes for which its WikiData_ID ends in a given number. This way, only some of the nodes
are targeted, and the Java Virtual Machine does not run out of memory
Args:
number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
This allows us tho divide the work, although its not very elegant.
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
or "" because it will already be = ``number`` (and, thus, exist)
"""
return f"""
CALL {{
MATCH (c:Disease)
WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
WITH 'SELECT *
WHERE{{
filter (?item = wd:' + c.WikiData_ID + ')
OPTIONAL{{
?item wdt:P665 ?kegg_id }}
OPTIONAL{{
?item wdt:P699 ?Disease_Ont }}
OPTIONAL{{
?item wdt:P927 ?Anatomical_Location
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Anatomical_Location rdfs:label ?Anatomical_Location_name. }} }}
OPTIONAL{{
?item wdt:P3841 ?HPO_ID }}
OPTIONAL{{
?item wdt:P494 ?ICD_1O }}
OPTIONAL{{
?item wdt:P7807 ?ICD_11 }}
OPTIONAL{{
?item wdt:P492 ?OMIM_ID }}
}}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json"}}, null)
YIELD value
UNWIND value['results']['bindings'] as row
SET c.KEGG_ID = row['kegg_id']['value'],
c.Disease_Ontology_ID = row['Disease_Ont']['value'],
c.Anatomical_Location = row['Anatomical_Location_name']['value'],
c.HPO_ID = row['HPO_ID']['value'],
c.ICD_1O = row['ICD_1O']['value'],
c.OMIM_ID = row['OMIM_ID']['value'],
c.ICD_11 = row['ICD_11']['value']
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_drugs(number, **kwargs):
"""
Creates drug nodes related with each of the "Cancer" nodes already on the database
Args:
number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
This allows us tho divide the work, although its not very elegant.
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
or "" because it will already be = ``number`` (and, thus, exist)
"""
return f"""
CALL {{
MATCH (c:Disease)
WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
WITH 'SELECT *
WHERE{{
filter (?item = wd:' + c.WikiData_ID + ')
OPTIONAL{{
?item wdt:P2176 ?Drugs
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Drugs rdfs:label ?DrugName. }} }}
OPTIONAL{{
?item wdt:P2888 ?Exact_Matches }}
}}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json"}}, null)
YIELD value
UNWIND value['results']['bindings'] as row
FOREACH(ignoreMe IN CASE WHEN row['Drugs'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (d:Drug{{URL:row['Drugs']['value'],
WikiData_ID:split(row['Drugs']['value'],'/')[-1],
Name:row['DrugName']['value'] }} )
MERGE (c)-[:TARGETED_BY]->(d))
FOREACH(ignoreMe IN CASE WHEN row['Exact_Matches'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (e:ExternalEquivalent{{URL:row['Exact_Matches']['value'],
WikiData_ID:split(row['Exact_Matches']['value'],'/')[-1]}})
MERGE (c)-[:EQUALS]-(e))
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_causes(number, **kwargs):
"""
Creates drug nodes related with each of the "Cancer" nodes already on the database
Args:
number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
This allows us tho divide the work, although its not very elegant.
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
or "" because it will already be = ``number`` (and, thus, exist)
"""
return f"""
CALL {{
MATCH (c:Disease)
WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
WITH 'SELECT ?Causes ?CauseName
WHERE{{
filter (?item = wd:' + c.WikiData_ID + ')
OPTIONAL{{
?item wdt:P828 ?Causes
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Causes rdfs:label ?CauseName. }} }}
}}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json"}}, null)
YIELD value
UNWIND value['results']['bindings'] as row
FOREACH(ignoreMe IN CASE WHEN row['Causes'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (ca:Cause{{URL:row['Causes']['value'],
WikiData_ID:split(row['Causes']['value'],'/')[-1],
Name:row['CauseName']['value'] }} )
MERGE (c)-[:CAUSED_BY]->(ca))
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_genes(number, **kwargs):
"""
Creates gene nodes related with each of the "Cancer" nodes already on the database
Args:
number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
This allows us tho divide the work, although its not very elegant.
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
or "" because it will already be = ``number`` (and, thus, exist)
"""
return f"""
CALL {{
MATCH (c:Disease)
WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
WITH 'SELECT *
WHERE{{
filter (?item = wd:' + c.WikiData_ID + ')
OPTIONAL{{
?item wdt:P2293 ?Genetic_Associations.
?Genetic_Associations wdt:P703 wd:Q15978631
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Genetic_Associations rdfs:label ?GeneName. }} }}
}}' AS sparql, c
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json"}}, null)
YIELD value
UNWIND value['results']['bindings'] as row
FOREACH(ignoreMe IN CASE WHEN row['Genetic_Associations'] is not null and row['GeneName'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (g:Gene{{URL:row['Genetic_Associations']['value'],
WikiData_ID:split(row['Genetic_Associations']['value'],'/')[-1],
Name:row['GeneName']['value'] }})
MERGE (g)-[:ASSOCIATED_DISEASE_GENE]->(c))
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_drug_external_ids(query = "Wikidata_ID", **kwargs):
"""
Adds some external IDs to any "Drug" nodes already present on the database.
Since the PDB information had too much values which caused triple duplicates that overcharged the system,
they were intentionally left out.
Args:
query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for which external IDs will be added.
**kwargs: Any number of arbitrary keyword arguments
Returns:
neo4j.Result:
A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.
.. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
because there will always be a WikiData_ID, but it is useful in the rest of the cases
"""
if query == "DrugBank_ID":
query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
"""?item (wdt:P715) \"' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '\" """]
else:
query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
"""filter (?item = wd:' + d.WikiData_ID + ')"""]
return f"""
CALL {{
MATCH (d:Drug)
{query_text[0]}
WITH 'SELECT DISTINCT *
WHERE {{
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}
{query_text[1]}
OPTIONAL{{
?item wdt:P715 ?DrugBank }}
OPTIONAL{{
?item wdt:P683 ?ChEBI_ID }}
OPTIONAL{{
?item wdt:P592 ?CHEMBL_ID }}
OPTIONAL{{
?item wdt:P661 ?ChemSpider_ID }}
OPTIONAL{{
?item wdt:P662 ?PubChem_ID }}
OPTIONAL{{
?item wdt:P665 ?KEGG_ID }}
OPTIONAL{{
?item wdt:P231 ?CAS_Number }}
OPTIONAL{{
?item wdt:P267 ?ATC_Code }}
OPTIONAL{{
?item wdt:P2175 ?Medical_condition_treated
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Medical_condition_treated rdfs:label ?Medical_condition_treated_name. }} }}
}}' AS sparql, d
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json" }}, null )
YIELD value
UNWIND value['results']['bindings'] as row
SET d.WikiData_ID = split(row['item']['value'],'/')[-1],
d.CHEMBL_ID = row['CHEMBL_ID']['value'],
d.ChEBI_ID = row['ChEBI_ID']['value'],
d.PubChem_ID = row['PubChem_ID']['value'],
d.KEGG_ID = row['KEGG_ID']['value'],
d.DrugBank_ID = row['DrugBank']['value'],
d.CAS_Number = row['CAS_Number']['value'],
d.ChemSpider_ID = row['ChemSpider_ID']['value']
FOREACH(ignoreMe IN CASE WHEN row['ATC_Code'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (pri:ATC {{ Code:row['ATC_Code']['value'] }})
MERGE (d)-[r:RELATED_ATC]->(pri)
)
FOREACH(ignoreMe IN CASE WHEN row['MeSH_Descriptor_Name'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (c:MeSH {{ MeSH_ID:row['MeSH_Descriptor_ID']['value'] ,
Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']}})
MERGE (d)-[r2:RELATED_MESH]->(c)
)
FOREACH(ignoreMe IN CASE WHEN row['MeSH_Concept_Name'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (cc:MeSH {{ MeSH_ID:row['MeSH_Concept_ID']['value'] ,
Type:"Concept", Name: row['MeSH_Concept_Name']['value']}})
MERGE (d)-[r3:RELATED_MESH]->(cc)
)
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_more_drug_info(query = "WikiData_ID", **kwargs):
"""
Creates some nodes that are related with each of the "Drug" nodes already existing
on the database: routes of administration, targeted metabolites and approved drugs
that tehy are been used in
Args:
query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for
which external IDs will be added; default is "WikiData_ID"
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. TODO:: ADD ROLE to metabolite interactions
.. NOTE:: This transaction has been separated in order to keep response times low
.. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
because there will always be a WikiData_ID, but it is useful in the rest of the cases
"""
if query == "DrugBank_ID":
query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
"""?item (wdt:P715) \"' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '\" """]
else:
query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
"""filter (?item = wd:' + d.WikiData_ID + ')"""]
return f"""
CALL {{
MATCH (d:Drug)
{query_text[0]}
WITH 'SELECT DISTINCT *
WHERE {{
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}
{query_text[1]}
OPTIONAL{{
?item wdt:P234 ?InChI }}
OPTIONAL{{
?item wdt:P235 ?InChIKey }}
OPTIONAL{{
?item wdt:P2067 ?Mass }}
OPTIONAL{{
?item wdt:P274 ?Chemical_Formula }}
OPTIONAL{{
?item wdt:P233 ?Canonical_Smiles }}
OPTIONAL{{
?item wdt:P7830 ?LiverTox }}
}}' AS sparql, d
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json" }}, null )
YIELD value
UNWIND value['results']['bindings'] as row
SET d.WikiData_ID = split(row['item']['value'],'/')[-1],
d.InChI = row['InChI']['value'],
d.InChIKey = row['InChIKey']['value'],
d.Average_Mass = row['Mass']['value'],
d.Formula = row['Chemical_Formula']['value'],
d.SMILES = row['Canonical_Smiles']['value'],
d.Pregnancy_Category = row['Pregnancy_Category_Name']['value'],
d.LiverTox = row['LiverTox']['value']
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_yet_more_drug_info(query = "WikiData_ID", **kwargs):
"""
Creates some nodes that are related with each of the "Drug" nodes already existing
on the database: routes of administration, targeted metabolites and approved drugs
that tehy are been used in
Args:
query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for
which external IDs will be added; default is "WikiData_ID"
**kwargs: Any number of arbitrary keyword arguments
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
"""
if query == "DrugBank_ID":
query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
"""?item (wdt:P715) ' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '"""]
else:
query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
"""filter (?item = wd:' + d.WikiData_ID + ')"""]
return f"""
CALL {{
MATCH (d:Drug)
{query_text[0]}
WITH 'SELECT DISTINCT *
WHERE {{
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}
{query_text[1]}
OPTIONAL{{
?item wdt:P636 ?Route_of_Admin
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Route_of_Admin rdfs:label ?Route_of_Admin_name. }} }}
OPTIONAL{{
?item wdt:P3780 ?Active_ingredient_in
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Active_ingredient_in rdfs:label ?Active_ingredient_in_name. }} }}
OPTIONAL{{
?item wdt:P129 ?Interacts_with
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
?Interacts_with rdfs:label ?Interacts_with_name. }} }}
}}' AS sparql, d
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{{ Accept: "application/sparql-results+json"}}, null)
YIELD value
UNWIND value['results']['bindings'] as row
SET d.WikiData_ID = split(row['item']['value'],'/')[-1]
FOREACH(ignoreMe IN CASE WHEN row['Interacts_with'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (m:Metabolite{{URL:row['Interacts_with']['value'],
WikiData_ID:split(row['Interacts_with']['value'],'/')[-1],
Name:row['Interacts_with_name']['value'] }})
MERGE (d)-[:INTERACTS_WITH]-(m))
FOREACH(ignoreMe IN CASE WHEN row['Route_of_Admin'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (r:AdministrationRoute{{URL:row['Route_of_Admin']['value'],
WikiData_ID:split(row['Route_of_Admin']['value'],'/')[-1],
Name:row['Route_of_Admin_name']['value'] }})
MERGE (d)-[:ADMINISTERED_VIA]->(r))
FOREACH(ignoreMe IN CASE WHEN row['Active_ingredient_in'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (me:Product{{URL:row['Active_ingredient_in']['value'],
WikiData_ID:split(row['Active_ingredient_in']['value'],'/')[-1],
Name:row['Active_ingredient_in_name']['value'] }})
MERGE (d)-[:PART_OF_PRODUCT]->(me))
}} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_gene_info():
"""
A Cypher Query that adds some external IDs and properties to "Gene" nodes already existing on
the database. This query forces the genes to have a "found_in_taxon:homo_sapiens" label. This means
that any non-human genes will not be annotated (.. TODO:: delete those)
Returns:
str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
.. NOTE:: Genomic Start and ends keep just the 2nd position, as reported in wikidata
.. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
because there will always be a WikiData_ID, but it is useful in the rest of the cases
.. TODO:: Might include P684 "Orthologues" for more info (it crashed java)
"""
return """
CALL {
MATCH (g:Gene)
WHERE g.WikiData_ID IS NOT NULL AND g.WikiData_ID <> ""
WITH 'SELECT *
WHERE{
filter (?item = wd:' + g.WikiData_ID + ')
?item wdt:P703 wd:Q15978631
OPTIONAL{
?item wdt:P351 ?Entrez_ID }
OPTIONAL{
?item wdt:P594 ?Ensembl_Gene_ID }
OPTIONAL{
?item wdt:P354 ?HGNC_ID }
OPTIONAL{
?item wdt:P492 ?OMIM_ID }
OPTIONAL{
?item wdt:P1057 ?Chromosome_Location
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?Chromosome_Location rdfs:label ?Chromosome_Location_name. } }
OPTIONAL{
?item wdt:P2548 ?Strand_Orientation
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?Strand_Orientation rdfs:label ?Strand_Orientation_name. } }
OPTIONAL{
?item wdt:P644 ?Genomic_Start }
OPTIONAL{
?item wdt:P645 ?Genomic_End }
OPTIONAL{
?item wdt:P4196 ?Cytogenetic_Location }
OPTIONAL{
?item wdt:P593 ?HomoloGene_ID }
OPTIONAL{
?item wdt:P688 ?Encodes
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?Encodes rdfs:label ?Encodes_name. } }
OPTIONAL{
?item wdt:P5572 ?Expressed_in
SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
?Expressed_in rdfs:label ?Expressed_in_name. } }
OPTIONAL{
?item wdt:P2888 ?Exact_Matches }
}' AS sparql, g
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{ Accept: "application/sparql-results+json" }, null )
YIELD value
UNWIND value['results']['bindings'] as row
SET g.Entrez_ID = row['Entrez_ID']['value'],
g.Ensembl_Gene_ID = row['Ensembl_Gene_ID']['value'],
g.HGNC_ID = row['HGNC_ID']['value'],
g.OMIM_ID = row['OMIM_ID']['value'],
g.HomoloGene_ID = row['HomoloGene_ID']['value'],
g.Cytogenetic_Location = row['Cytogenetic_Location']['value'],
g.Chromosome_Location = row['Chromosome_Location_name']['value'],
g.Strand_Orientation = row['Strand_Orientation_name']['value'],
g.Genomic_Start = row['Genomic_Start']['value'],
g.Genomic_End = row['Genomic_End']['value']
FOREACH(ignoreMe IN CASE WHEN row['Encodes'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (m:Metabolite{URL:row['Encodes']['value'],
WikiData_ID:split(row['Encodes']['value'],'/')[-1],
Name:row['Encodes_name']['value'] })
MERGE (g)-[:ENCODES]->(m))
FOREACH(ignoreMe IN CASE WHEN row['Expressed_in'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (t:Tissue{URL:row['Expressed_in']['value'],
WikiData_ID:split(row['Expressed_in']['value'],'/')[-1],
Name:row['Expressed_in_name']['value'] })
MERGE (g)-[:EXPRESSED_IN]->(t))
FOREACH(ignoreMe IN CASE WHEN row['Exact_Matches'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (e:ExternalEquivalent{URL:row['Exact_Matches']['value'],
WikiData_ID:split(row['Exact_Matches']['value'],'/')[-1]})
MERGE (g)-[:EQUALS]-(e))
} IN TRANSACTIONS OF 10 rows
"""
[docs]def add_wikidata_and_mesh_by_name():
"""
A function that adds some MeSH nodes and WikiData_IDs
to existing nodes, based on their Wikipedia Article Title.
Args:
tx (neo4j.Session): The session under which the driver is running
Returns:
neo4j.Result:
A Neo4J connexion to the database that modifies it according
to the CYPHER statement contained in the function.
"""
return """
CALL {
MATCH (d)
WHERE (d:AdministrationRoute OR d:Cause OR d:Disease OR d:Drug OR d:ExternalEquivalent
OR d:Gene OR d:Metabolite OR d:Product OR d:Tissue)
AND (d.Name IS NOT null)
WITH 'SELECT DISTINCT *
WHERE{
?sitelink schema:about ?WikiData_ID;
schema:isPartOf <https://en.wikipedia.org/>;
schema:name \"' + replace(d.Name, " ", " ") + '\"@en.
OPTIONAL{
?WikiData_ID p:P486 ?statement1 .
?statement1 ps:P486 ?MeSH_Descriptor_ID .
?statement1 pq:P1810 ?MeSH_Descriptor_Name .
}
OPTIONAL{
?WikiData_ID p:P6694 ?statement2 .
?statement2 ps:P6694 ?MeSH_Concept_ID .
?statement2 pq:P1810 ?MeSH_Concept_Name . }
}' AS sparql, d
CALL apoc.load.jsonParams(
replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
{ Accept: "application/sparql-results+json" }, null )
YIELD value
UNWIND value['results']['bindings'] as row
SET d.WikiData_ID = split(row['WikiData_ID']['value'],'/')[-1]
FOREACH(ignoreMe IN CASE WHEN row['MeSH_Descriptor_Name'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (c:MeSH { MeSH_ID:row['MeSH_Descriptor_ID']['value'] ,
Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']})
MERGE (d)-[r:RELATED_MESH]->(c)
)
FOREACH(ignoreMe IN CASE WHEN row['MeSH_Concept_Name'] IS NOT NULL THEN [1] ELSE [] END |
MERGE (cc:MeSH { MeSH_ID:row['MeSH_Concept_ID']['value'] ,
Type:"Concept", Name: row['MeSH_Concept_Name']['value']})
MERGE (d)-[r2:RELATED_MESH]->(cc)
)
} IN TRANSACTIONS OF 10 rows
"""