Source code for CanGraph.QueryWikidata.build_database

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT

"""
A python module that provides the necessary functions to transition selected parts of the Wikidata database to graph format,
either from scratch importing all the nodes (as showcased in :obj:`CanGraph.QueryWikidata.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).

.. NOTE:: You may notice some functions here present the ``**kwargs`` arguments option.
    This is in order to make the functions compatible with the
    :obj:`CanGraph.miscelaneous.manage_transaction` function, which might send back a variable
    number of arguments (although technically it could work without the ``**kwargs`` option)
"""

[docs]def initial_cancer_discovery():
    """
    A Neo4J Cypher Statment that queries wikidata for Human Cancers. Since using the "afflicts:human"
    tag didnt have much use here, I used a simple workaround: Query wikidata for all humans, and, among them,
    find all of this for which their cause of death was a subclass of "Cancer" (Q12078). Unfortunaltely,
    some of them were diagnosed "Cancer" (Q12078), which is too general, so I removed it.

    Args:
        tx          (neo4j.Session): The session under which the driver is running

    Returns:
        neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.
    """
    return ("""
        WITH 'SELECT DISTINCT ?cause_of_death ?cause_of_death_name
        WHERE {
                ?human wdt:P31 wd:Q5;
                wdt:P509 ?cause_of_death.
                ?cause_of_death wdt:P279* wd:Q12078.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                 ?cause_of_death rdfs:label ?cause_of_death_name. }
                }' AS sparql

        CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row
        WITH row['cause_of_death_name']['value'] as label,
            row['cause_of_death']['value'] as url,
            split(row['cause_of_death']['value'],'/')[-1] as id

        CREATE (c:Disease)
        SET c.Name = label,
            c.URL = url,
            c.WikiData_ID = id

        WITH c
        MATCH (n:Disease)
            WHERE n.WikiData_ID = "Q12078"
        DETACH DELETE n
        """)

[docs]def find_subclass_of_disease():
    """
    A Neo4J Cypher Statment that queries wikidata for subclasses of "Disease" nodes already
    present on the Database. Since these are expected to only affect humans, this subclasses
    should also, only affect humans

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases
    """
    return """
        CALL {
        MATCH (c:Disease)
        WHERE c.WikiData_ID IS NOT NULL AND c.WikiData_ID <> ""
        WITH 'SELECT DISTINCT ?cancer ?cancer_name
        WHERE {
                ?cancer wdt:P279 wd:' + c.WikiData_ID + ' .
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                 ?cancer rdfs:label ?cancer_name. }
                }' AS sparql, c

        CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        CREATE (s:Disease)
        SET s.Name = row['cancer_name']['value'],
            s.URL = row['cancer']['value'],
            s.WikiData_ID = split(row['cancer']['value'],'/')[-1]

        CREATE (s)-[:SUBCLASS_OF]->(c)
        } IN TRANSACTIONS OF 10 rows
        """

[docs]def find_instance_of_disease():
    """
    A Neo4J Cypher Statment that queries wikidata for instances of "Disease" nodes already
    present on the Database. Since these are expected to only affect humans, this subclasses
    should also, only affect humans

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases
    """
    return """
        CALL {
        MATCH (c:Disease)
        WHERE c.WikiData_ID IS NOT NULL AND c.WikiData_ID <> ""
        WITH 'SELECT DISTINCT ?cancer ?cancer_name
        WHERE {
                ?cancer wdt:P31 wd:' + c.WikiData_ID + ' .
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                 ?cancer rdfs:label ?cancer_name. }
                }' AS sparql, c

        CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        CREATE (i:Disease)
        SET i.Name = row['cancer_name']['value'],
            i.URL = row['cancer']['value'],
            i.WikiData_ID = split(row['cancer']['value'],'/')[-1]

        CREATE (i)-[:INSTANCE_OF]->(c)
        } IN TRANSACTIONS OF 10 rows
        """

[docs]def add_disease_info(number, **kwargs):
    """
    Adds info to "Disease" nodes for which its WikiData_ID ends in a given number. This way, only some of the nodes
    are targeted, and the Java Virtual Machine does not run out of memory

    Args:
        number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
            This allows us tho divide the work, although its not very elegant.
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
        or "" because it will already be = ``number`` (and, thus, exist)
    """
    return f"""
        CALL {{
        MATCH (c:Disease)
            WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
        WITH 'SELECT *
            WHERE{{
                filter (?item = wd:' + c.WikiData_ID + ')

            OPTIONAL{{
                ?item wdt:P665 ?kegg_id }}
            OPTIONAL{{
                ?item wdt:P699 ?Disease_Ont }}
            OPTIONAL{{
                ?item wdt:P927 ?Anatomical_Location
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                         ?Anatomical_Location rdfs:label ?Anatomical_Location_name. }} }}
            OPTIONAL{{
                ?item wdt:P3841 ?HPO_ID }}
            OPTIONAL{{
                ?item wdt:P494 ?ICD_1O }}
            OPTIONAL{{
                ?item wdt:P7807 ?ICD_11 }}
            OPTIONAL{{
                ?item wdt:P492 ?OMIM_ID }}
            }}' AS sparql, c

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET c.KEGG_ID = row['kegg_id']['value'],
            c.Disease_Ontology_ID = row['Disease_Ont']['value'],
            c.Anatomical_Location = row['Anatomical_Location_name']['value'],
            c.HPO_ID = row['HPO_ID']['value'],
            c.ICD_1O = row['ICD_1O']['value'],
            c.OMIM_ID = row['OMIM_ID']['value'],
            c.ICD_11 = row['ICD_11']['value']

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_drugs(number, **kwargs):
    """
    Creates drug nodes related with each of the "Cancer" nodes already on the database

    Args:
        number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
            This allows us tho divide the work, although its not very elegant.
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
        or "" because it will already be = ``number`` (and, thus, exist)
    """
    return f"""
        CALL {{

        MATCH (c:Disease)
            WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
        WITH 'SELECT *
            WHERE{{
                filter (?item = wd:' + c.WikiData_ID + ')

            OPTIONAL{{
                ?item wdt:P2176 ?Drugs
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                         ?Drugs rdfs:label ?DrugName. }} }}
            OPTIONAL{{
                ?item wdt:P2888 ?Exact_Matches }}
            }}' AS sparql, c

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        FOREACH(ignoreMe IN CASE WHEN row['Drugs'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (d:Drug{{URL:row['Drugs']['value'],
                               WikiData_ID:split(row['Drugs']['value'],'/')[-1],
                               Name:row['DrugName']['value'] }} )
                MERGE (c)-[:TARGETED_BY]->(d))

        FOREACH(ignoreMe IN CASE WHEN row['Exact_Matches'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (e:ExternalEquivalent{{URL:row['Exact_Matches']['value'],
                                            WikiData_ID:split(row['Exact_Matches']['value'],'/')[-1]}})
                MERGE (c)-[:EQUALS]-(e))

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_causes(number, **kwargs):
    """
    Creates drug nodes related with each of the "Cancer" nodes already on the database

    Args:
        number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
            This allows us tho divide the work, although its not very elegant.
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
        or "" because it will already be = ``number`` (and, thus, exist)
    """
    return f"""
        CALL {{

        MATCH (c:Disease)
            WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
        WITH 'SELECT ?Causes ?CauseName
            WHERE{{
                filter (?item = wd:' + c.WikiData_ID + ')

            OPTIONAL{{
                ?item wdt:P828 ?Causes
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                         ?Causes rdfs:label ?CauseName. }} }}
            }}' AS sparql, c

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        FOREACH(ignoreMe IN CASE WHEN row['Causes'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (ca:Cause{{URL:row['Causes']['value'],
                                WikiData_ID:split(row['Causes']['value'],'/')[-1],
                                Name:row['CauseName']['value'] }} )
                MERGE (c)-[:CAUSED_BY]->(ca))

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_genes(number, **kwargs):
    """
    Creates gene nodes related with each of the "Cancer" nodes already on the database

    Args:
        number (int): From 0 to 9, the number under which the WikiData_IDs to process should ends.
            This allows us tho divide the work, although its not very elegant.
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: Here, there is no need to force c.WikiData_ID to not be null
        or "" because it will already be = ``number`` (and, thus, exist)
    """
    return f"""
        CALL {{

        MATCH (c:Disease)
            WHERE toFloat(split(c.WikiData_ID,"")[-1]) = toFloat({number})
        WITH 'SELECT *
            WHERE{{
                filter (?item = wd:' + c.WikiData_ID + ')

            OPTIONAL{{
                ?item wdt:P2293 ?Genetic_Associations.
                ?Genetic_Associations wdt:P703 wd:Q15978631
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                         ?Genetic_Associations rdfs:label ?GeneName. }} }}
            }}' AS sparql, c

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        FOREACH(ignoreMe IN CASE WHEN row['Genetic_Associations'] is not null and row['GeneName'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (g:Gene{{URL:row['Genetic_Associations']['value'],
                              WikiData_ID:split(row['Genetic_Associations']['value'],'/')[-1],
                              Name:row['GeneName']['value'] }})
                MERGE (g)-[:ASSOCIATED_DISEASE_GENE]->(c))

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_drug_external_ids(query = "Wikidata_ID", **kwargs):
    """
    Adds some external IDs to any "Drug" nodes already present on the database.
    Since the PDB information had too much values which caused triple duplicates that overcharged the system,
    they were intentionally left out.

    Args:
        query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for which external IDs will be added.
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        neo4j.Result:
            A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function.

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases
    """
    if query == "DrugBank_ID":
      query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
                    """?item (wdt:P715) \"' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '\" """]
    else:
      query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
                    """filter (?item = wd:' + d.WikiData_ID + ')"""]


    return f"""
        CALL {{

        MATCH (d:Drug)
        {query_text[0]}
        WITH 'SELECT DISTINCT *
        WHERE {{
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}

        {query_text[1]}

        OPTIONAL{{
            ?item wdt:P715 ?DrugBank }}

        OPTIONAL{{
            ?item wdt:P683 ?ChEBI_ID }}
        OPTIONAL{{
            ?item wdt:P592 ?CHEMBL_ID }}
        OPTIONAL{{
            ?item wdt:P661 ?ChemSpider_ID }}
        OPTIONAL{{
            ?item wdt:P662 ?PubChem_ID }}
        OPTIONAL{{
            ?item wdt:P665 ?KEGG_ID }}
        OPTIONAL{{
            ?item wdt:P231 ?CAS_Number }}
        OPTIONAL{{
            ?item wdt:P267 ?ATC_Code }}

        OPTIONAL{{
            ?item wdt:P2175 ?Medical_condition_treated
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                    ?Medical_condition_treated rdfs:label ?Medical_condition_treated_name. }} }}

            }}' AS sparql, d

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json" }}, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET d.WikiData_ID = split(row['item']['value'],'/')[-1],
            d.CHEMBL_ID = row['CHEMBL_ID']['value'],
            d.ChEBI_ID = row['ChEBI_ID']['value'],
            d.PubChem_ID = row['PubChem_ID']['value'],
            d.KEGG_ID = row['KEGG_ID']['value'],
            d.DrugBank_ID = row['DrugBank']['value'],
            d.CAS_Number = row['CAS_Number']['value'],
            d.ChemSpider_ID = row['ChemSpider_ID']['value']

        FOREACH(ignoreMe IN CASE WHEN row['ATC_Code'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (pri:ATC {{ Code:row['ATC_Code']['value'] }})
            MERGE (d)-[r:RELATED_ATC]->(pri)
        )

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Descriptor_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (c:MeSH {{ MeSH_ID:row['MeSH_Descriptor_ID']['value'] ,
                             Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']}})
            MERGE (d)-[r2:RELATED_MESH]->(c)
        )

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Concept_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (cc:MeSH {{ MeSH_ID:row['MeSH_Concept_ID']['value'] ,
                              Type:"Concept", Name: row['MeSH_Concept_Name']['value']}})
            MERGE (d)-[r3:RELATED_MESH]->(cc)
        )

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_more_drug_info(query = "WikiData_ID", **kwargs):
    """
    Creates some nodes that are related with each of the "Drug" nodes already existing
    on the database: routes of administration, targeted metabolites and approved drugs
    that tehy are been used in

    Args:
        query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for
            which external IDs will be added; default is "WikiData_ID"
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.


    .. TODO:: ADD ROLE to metabolite interactions

    .. NOTE:: This transaction has been separated in order to keep response times low

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases
    """
    if query == "DrugBank_ID":
      query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
                    """?item (wdt:P715) \"' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '\" """]
    else:
      query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
                    """filter (?item = wd:' + d.WikiData_ID + ')"""]


    return f"""
        CALL {{

        MATCH (d:Drug)
        {query_text[0]}
        WITH 'SELECT DISTINCT *
        WHERE {{
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}

        {query_text[1]}

        OPTIONAL{{
            ?item wdt:P234 ?InChI }}
        OPTIONAL{{
            ?item wdt:P235 ?InChIKey }}
        OPTIONAL{{
            ?item wdt:P2067 ?Mass }}
        OPTIONAL{{
            ?item wdt:P274 ?Chemical_Formula }}
        OPTIONAL{{
            ?item wdt:P233 ?Canonical_Smiles }}
        OPTIONAL{{
            ?item wdt:P7830 ?LiverTox }}
            }}' AS sparql, d

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json" }}, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET d.WikiData_ID = split(row['item']['value'],'/')[-1],
            d.InChI = row['InChI']['value'],
            d.InChIKey = row['InChIKey']['value'],
            d.Average_Mass = row['Mass']['value'],
            d.Formula = row['Chemical_Formula']['value'],
            d.SMILES = row['Canonical_Smiles']['value'],
            d.Pregnancy_Category = row['Pregnancy_Category_Name']['value'],
            d.LiverTox = row['LiverTox']['value']

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_yet_more_drug_info(query = "WikiData_ID", **kwargs):
    """
    Creates some nodes that are related with each of the "Drug" nodes already existing
    on the database: routes of administration, targeted metabolites and approved drugs
    that tehy are been used in

    Args:
        query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for
            which external IDs will be added; default is "WikiData_ID"
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
    """
    if query == "DrugBank_ID":
      query_text = [""" WHERE d.DrugBank_ID IS NOT NULL AND d.DrugBank_ID <> "" """,
                    """?item (wdt:P715) ' + apoc.text.replace(d.DrugBank_ID, "DB", "") + '"""]
    else:
      query_text = ["""WHERE d.WikiData_ID IS NOT NULL AND d.WikiData_ID <> "" """,
                    """filter (?item = wd:' + d.WikiData_ID + ')"""]

    return f"""
        CALL {{

        MATCH (d:Drug)
        {query_text[0]}
        WITH 'SELECT DISTINCT *
        WHERE {{
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}

        {query_text[1]}

        OPTIONAL{{
            ?item wdt:P636 ?Route_of_Admin
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                    ?Route_of_Admin rdfs:label ?Route_of_Admin_name. }} }}
        OPTIONAL{{
            ?item wdt:P3780 ?Active_ingredient_in
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                    ?Active_ingredient_in rdfs:label ?Active_ingredient_in_name. }} }}
        OPTIONAL{{
            ?item wdt:P129 ?Interacts_with
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                    ?Interacts_with rdfs:label ?Interacts_with_name. }} }}
            }}' AS sparql, d

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET d.WikiData_ID = split(row['item']['value'],'/')[-1]

        FOREACH(ignoreMe IN CASE WHEN row['Interacts_with'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (m:Metabolite{{URL:row['Interacts_with']['value'],
                        WikiData_ID:split(row['Interacts_with']['value'],'/')[-1],
                        Name:row['Interacts_with_name']['value'] }})
            MERGE (d)-[:INTERACTS_WITH]-(m))

        FOREACH(ignoreMe IN CASE WHEN row['Route_of_Admin'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (r:AdministrationRoute{{URL:row['Route_of_Admin']['value'],
                        WikiData_ID:split(row['Route_of_Admin']['value'],'/')[-1],
                        Name:row['Route_of_Admin_name']['value'] }})
            MERGE (d)-[:ADMINISTERED_VIA]->(r))

        FOREACH(ignoreMe IN CASE WHEN row['Active_ingredient_in'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (me:Product{{URL:row['Active_ingredient_in']['value'],
                        WikiData_ID:split(row['Active_ingredient_in']['value'],'/')[-1],
                        Name:row['Active_ingredient_in_name']['value'] }})
            MERGE (d)-[:PART_OF_PRODUCT]->(me))

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_gene_info():
    """
    A Cypher Query that adds some external IDs and properties to "Gene" nodes already existing on
    the database. This query forces the genes to have a "found_in_taxon:homo_sapiens" label. This means
    that any non-human genes will not be annotated (.. TODO:: delete those)

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. NOTE:: Genomic Start and ends keep just the 2nd position, as reported in wikidata

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases

    .. TODO:: Might include P684 "Orthologues" for more info (it crashed java)
    """
    return """
        CALL {

        MATCH (g:Gene)
        WHERE g.WikiData_ID IS NOT NULL AND g.WikiData_ID <> ""
        WITH 'SELECT *
            WHERE{
                filter (?item = wd:' + g.WikiData_ID + ')
                ?item wdt:P703 wd:Q15978631

            OPTIONAL{
                ?item wdt:P351 ?Entrez_ID }
            OPTIONAL{
                ?item wdt:P594 ?Ensembl_Gene_ID }
            OPTIONAL{
                ?item wdt:P354 ?HGNC_ID }
            OPTIONAL{
                ?item wdt:P492 ?OMIM_ID }
            OPTIONAL{
                ?item wdt:P1057 ?Chromosome_Location
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                        ?Chromosome_Location rdfs:label ?Chromosome_Location_name. } }
            OPTIONAL{
                ?item wdt:P2548 ?Strand_Orientation
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                        ?Strand_Orientation rdfs:label ?Strand_Orientation_name. } }
            OPTIONAL{
                ?item wdt:P644 ?Genomic_Start }
            OPTIONAL{
                ?item wdt:P645 ?Genomic_End }
            OPTIONAL{
                ?item wdt:P4196 ?Cytogenetic_Location  }
            OPTIONAL{
                ?item wdt:P593 ?HomoloGene_ID }

            OPTIONAL{
                ?item wdt:P688 ?Encodes
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                        ?Encodes rdfs:label ?Encodes_name. } }
            OPTIONAL{
                ?item wdt:P5572 ?Expressed_in
                SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
                                        ?Expressed_in rdfs:label ?Expressed_in_name. } }
            OPTIONAL{
                ?item wdt:P2888 ?Exact_Matches }

            }' AS sparql, g

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET g.Entrez_ID = row['Entrez_ID']['value'],
            g.Ensembl_Gene_ID = row['Ensembl_Gene_ID']['value'],
            g.HGNC_ID = row['HGNC_ID']['value'],
            g.OMIM_ID = row['OMIM_ID']['value'],
            g.HomoloGene_ID = row['HomoloGene_ID']['value'],
            g.Cytogenetic_Location = row['Cytogenetic_Location']['value'],
            g.Chromosome_Location = row['Chromosome_Location_name']['value'],
            g.Strand_Orientation = row['Strand_Orientation_name']['value'],
            g.Genomic_Start = row['Genomic_Start']['value'],
            g.Genomic_End = row['Genomic_End']['value']

        FOREACH(ignoreMe IN CASE WHEN row['Encodes'] IS NOT NULL THEN [1] ELSE [] END |
                    MERGE (m:Metabolite{URL:row['Encodes']['value'],
                                WikiData_ID:split(row['Encodes']['value'],'/')[-1],
                                Name:row['Encodes_name']['value'] })
                    MERGE (g)-[:ENCODES]->(m))

        FOREACH(ignoreMe IN CASE WHEN row['Expressed_in'] IS NOT NULL THEN [1] ELSE [] END |
                    MERGE (t:Tissue{URL:row['Expressed_in']['value'],
                                WikiData_ID:split(row['Expressed_in']['value'],'/')[-1],
                                Name:row['Expressed_in_name']['value'] })
                    MERGE (g)-[:EXPRESSED_IN]->(t))

        FOREACH(ignoreMe IN CASE WHEN row['Exact_Matches'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (e:ExternalEquivalent{URL:row['Exact_Matches']['value'],
                                            WikiData_ID:split(row['Exact_Matches']['value'],'/')[-1]})
                MERGE (g)-[:EQUALS]-(e))

        } IN TRANSACTIONS OF 10 rows
        """

[docs]def add_metabolite_info(query = "ChEBI_ID", **kwargs):
    """
    A Cypher Query that adds some external IDs and properties to "Metabolite" nodes
    already existing on the database. Two kind of metabolites exist: those that are
    encoded by a given gene, and those that interact with a given drug. Both are
    adressed here, since they are similar, and, most likely, instances of proteins.

    * This function forces all metabolites to have a "found_in_taxon:human" target
    * The metabolites are not forced to be proteins, but if they are, this is kept in the "instance_of" record

    Args:
        query (str): One of ["DrugBank_ID","WikiData_ID"], a way to identify the nodes for
            which external IDs will be added; default is "WikiData_ID"
        **kwargs: Any number of arbitrary keyword arguments

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.

    .. TODO:: Might include P527 "has part or parts" for more info (it crashed java)

    .. NOTE:: We are forcing c.WikiData_ID to not be null or "". This is not necessary if we are just building the wikidata database,
        because there will always be a WikiData_ID, but it is useful in the rest of the cases
    """
    if query == "ChEBI_ID":
      query_text = [""" WHERE m.ChEBI_ID IS NOT NULL AND m.ChEBI_ID <> "" """,
                    """?item (wdt:P683) \"' +  m.ChEBI_ID + '\" ."""]
    else:
      query_text = ["""WHERE m.WikiData_ID IS NOT NULL AND m.WikiData_ID <> "" """,
                    """filter (?item = wd:' + m.WikiData_ID + ')"""]

    return f"""
        CALL {{

        MATCH (m:Metabolite)
        {query_text[0]}
        WITH 'SELECT DISTINCT *
        WHERE {{
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }}

        {query_text[1]}

        OPTIONAL{{
            ?item wdt:P31 ?Instance_of
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                                     ?Instance_of rdfs:label ?Instance_of_name. }} }}

        OPTIONAL{{
            ?item (wdt:P683) ?ChEBI_ID }}
        OPTIONAL{{
            ?item wdt:P231 ?CAS_Number }}

        OPTIONAL{{
            ?item wdt:P234 ?InChI }}
        OPTIONAL{{
            ?item wdt:P235 ?InChIKey }}
        OPTIONAL{{
            ?item wdt:P352 ?UniProt_ID }}
        OPTIONAL{{
            ?item wdt:P7260 ?Transporter_DB_ID }}
        OPTIONAL{{
            ?item wdt:P274 ?Chemical_Formula }}
        OPTIONAL{{
            ?item wdt:P592 ?CHEMBL_ID }}
        OPTIONAL{{
            ?item wdt:P662 ?PubChem_ID }}
        OPTIONAL{{
            ?item wdt:P233 ?Canonical_Smiles }}
        OPTIONAL{{
            ?item wdt:P8117 ?FooDB_Compound_ID }}

        OPTIONAL{{
            ?item p:P486 ?statement1 .
            ?statement1 ps:P486 ?MeSH_Descriptor_ID .
            ?statement1 pq:P1810 ?MeSH_Descriptor_Name .
            }}
        OPTIONAL{{
            ?item p:P6694 ?statement2 .
            ?statement2 ps:P6694 ?MeSH_Concept_ID .
            ?statement2 pq:P1810 ?MeSH_Concept_Name . }}

        OPTIONAL{{
            ?item wdt:P129 ?Interacts_with
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en".
                    ?Interacts_with rdfs:label ?Interacts_with_name. }} }}
        OPTIONAL{{
            ?item wdt:P2888 ?Exact_Matches }}

            }}' AS sparql, m

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                {{ Accept: "application/sparql-results+json"}}, null)
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET m.WikiData_ID = split(row['item']['value'],'/')[-1],
            m.Function = row['Instance_of_name']['value'],
            m.UniProt_ID = row['UniProt_ID']['value'],
            m.ChEBI_ID = row['ChEBI_ID']['value'],
            m.CAS_Number = row['CAS_Number']['value'],
            m.Formula = row['Chemical_Formula']['value'],
            m.InChI = row['InChI']['value'],
            m.InChIKey = row['InChIKey']['value'],
            m.CHEMBL_ID = row['CHEMBL_ID']['value'],
            m.ChEBI_ID = row['ChEBI_ID']['value'],
            m.PubChem_ID = row['PubChem_ID']['value'],
            m.SMILES = row['Canonical_Smiles']['value'],
            m.FooDB_Compound_ID = row['FooDB_Compound_ID']['value'],
            m.Transporter_DB_ID = row['Transporter_DB_ID']['value']

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Descriptor_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (c:MeSH {{ MeSH_ID:row['MeSH_Descriptor_ID']['value'] ,
                             Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']}})
            MERGE (m)-[r:RELATED_MESH]->(c)
        )

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Concept_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (cc:MeSH {{ MeSH_ID:row['MeSH_Concept_ID']['value'] ,
                              Type:"Concept", Name: row['MeSH_Concept_Name']['value']}})
            MERGE (m)-[r2:RELATED_MESH]->(cc)
        )

        FOREACH(ignoreMe IN CASE WHEN row['Interacts_with'] IS NOT NULL THEN [1] ELSE [] END |
                    MERGE (me:Metabolite{{URL:row['Interacts_with']['value'],
                                WikiData_ID:split(row['Interacts_with']['value'],'/')[-1],
                                Name:row['Interacts_with_name']['value'] }})
                    MERGE (m)-[:INTERACTS_WITH]-(me))

        FOREACH(ignoreMe IN CASE WHEN row['Exact_Matches'] IS NOT NULL THEN [1] ELSE [] END |
                MERGE (e:ExternalEquivalent{{URL:row['Exact_Matches']['value'],
                                            WikiData_ID:split(row['Exact_Matches']['value'],'/')[-1]}})
                MERGE (m)-[:EQUALS]-(e))

        }} IN TRANSACTIONS OF 10 rows
        """

[docs]def add_toomuch_metabolite_info():
    """
    A function that adds loads of info to existing "Metabolite" nodes. This was left out, first because
    it might be too much information, (specially when it is already availaible by clicking the "url" field),
    and because, due to it been so much, it crashes the JVM.

    Returns:
        str: A CYPHER query that modifies the DB according to the CYPHER statement contained in the function.
    """
    return """
        CALL {
        MATCH (m:Metabolite)
        WHERE m.WikiData_ID IS NOT NULL AND m.WikiData_ID <> ""
        WITH 'SELECT *
            WHERE{
                filter (?item = wd:' + m.WikiData_ID + ')
                ?item wdt:P703 wd:Q15978631

            OPTIONAL{
                ?item wdt:P680 ?Molecular_Function }
            OPTIONAL{
                ?item wdt:P681 ?Cell_Component }
            OPTIONAL{
                ?item wdt:P682 ?Biological_Process }
            OPTIONAL{
                ?item wdt:P705 ?Ensembl_Protein_ID }
            OPTIONAL{
                ?item wdt:P637 ?RefSeq_Protein_ID }
            OPTIONAL{
                ?item wdt:P638 ?PDB_Structure_ID }

        }' AS sparql, m

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        WITH m, row, CASE WHEN EXISTS(m.A) THEN m.A ELSE [] END AS nA
            WHERE NONE (x IN nA WHERE x = row['Molecular_Function']['value'])
        SET m.A = nA + row['Molecular_Function']['value']

        WITH m, row, CASE WHEN EXISTS(m.A) THEN m.A ELSE [] END AS nA
            WHERE NONE (x IN nA WHERE x = row['Cell_Component']['value'])
        SET m.A = nA + row['Cell_Component']['value']

        WITH m, row, CASE WHEN EXISTS(m.A) THEN m.A ELSE [] END AS nA
            WHERE NONE (x IN nA WHERE x = row['Biological_Process']['value'])
        SET m.A = nA + row['Biological_Process']['value']

        WITH m, row, CASE WHEN EXISTS(m.A) THEN m.A ELSE [] END AS nA
            WHERE NONE (x IN nA WHERE x = row['Ensembl_Protein_ID']['value'])
        SET m.A = nA + row['Ensembl_Protein_ID']['value']

        WITH m, row, CASE WHEN EXISTS(m.A) THEN m.A ELSE [] END AS nA
            WHERE NONE (x IN nA WHERE x = row['PDB_Structure_ID']['value'])
        SET m.A = nA + row['PDB_Structure_ID']['value']

        } IN TRANSACTIONS OF 10 rows
        """

[docs]def add_wikidata_and_mesh_by_name():
    """
    A function that adds some MeSH nodes and WikiData_IDs
    to existing nodes, based on their Wikipedia Article Title.

    Args:
        tx (neo4j.Session): The session under which the driver is running

    Returns:
        neo4j.Result:
            A Neo4J connexion to the database that modifies it according
            to the CYPHER statement contained in the function.
    """
    return """
        CALL {
        MATCH (d)
        WHERE (d:AdministrationRoute OR d:Cause OR d:Disease OR d:Drug OR d:ExternalEquivalent
                                     OR d:Gene OR d:Metabolite OR d:Product OR d:Tissue)
        AND (d.Name IS NOT null)
        WITH 'SELECT DISTINCT *
        WHERE{
            ?sitelink schema:about ?WikiData_ID;
                    schema:isPartOf <https://en.wikipedia.org/>;
                    schema:name \"' +  replace(d.Name, "  ", " ") + '\"@en.
        OPTIONAL{
            ?WikiData_ID p:P486 ?statement1 .
            ?statement1 ps:P486 ?MeSH_Descriptor_ID .
            ?statement1 pq:P1810 ?MeSH_Descriptor_Name .
            }
        OPTIONAL{
            ?WikiData_ID p:P6694 ?statement2 .
            ?statement2 ps:P6694 ?MeSH_Concept_ID .
            ?statement2 pq:P1810 ?MeSH_Concept_Name . }
        }' AS sparql, d

            CALL apoc.load.jsonParams(
                replace("https://query.wikidata.org/sparql?query=" + apoc.text.urlencode(sparql), "\n", ""),
                { Accept: "application/sparql-results+json" }, null )
        YIELD value

        UNWIND value['results']['bindings'] as row

        SET d.WikiData_ID = split(row['WikiData_ID']['value'],'/')[-1]

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Descriptor_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (c:MeSH { MeSH_ID:row['MeSH_Descriptor_ID']['value'] ,
                            Type:"Descriptor", Name: row['MeSH_Descriptor_Name']['value']})
            MERGE (d)-[r:RELATED_MESH]->(c)
        )

        FOREACH(ignoreMe IN CASE WHEN row['MeSH_Concept_Name'] IS NOT NULL THEN [1] ELSE [] END |
            MERGE (cc:MeSH { MeSH_ID:row['MeSH_Concept_ID']['value'] ,
                            Type:"Concept", Name: row['MeSH_Concept_Name']['value']})
            MERGE (d)-[r2:RELATED_MESH]->(cc)
        )

        } IN TRANSACTIONS OF 10 rows
        """