Source code for CanGraph.GraphifyDrugBank.build_database

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT

"""
A python module that provides the necessary functions to transition the DrugBank database to graph format,
either from scratch importing all the nodes (as showcased in :obj:`CanGraph.GraphifyDrugBank.main`) or in a case-by-case basis,
to annotate existing metabolites (as showcased in :obj:`CanGraph.main`).
"""

# Import external modules necessary for the script
from alive_progress import alive_bar # A cute progress bar that shows the script is still running
import os, sys, shutil               # Vital modules to interact with the filesystem
from time import sleep               # A hack to avoid starving the system resources

# Import subscripts for the program
# This hack that allows us to de-duplicate the miscleaneous script in this less-used script
sys.path.append("../")
# .. NOTE::: Please beware that, if using this module by itself, you might need to copy "miscelaneous.py" into your path
# This is not the most elegant, but simplifies code maintenance, and this script shouldnt be used much so...
import miscelaneous as misc

[docs]def add_drugs(filename): """ Creates "Drug" nodes based on XML files obtained from the DrugBank website, adding some essential identifiers and external properties. .. seealso:: This way of working has been taken from `William Lyon's Blog <https://lyonwj.com/blog/grandstack-podcast-app-parsing-xml-neo4j-rss-episodes-playlists>`_ Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: Since Publications dont have any standard identificator, they are created using the "Title" """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH drug.type AS type, [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "name"][0]._text AS name, [X in drug._children WHERE X._type = "description"][0]._text AS description, [X in drug._children WHERE X._type = "cas-number"][0]._text AS cas_number, [X in drug._children WHERE X._type = "unii"][0]._text AS unii, [X in drug._children WHERE X._type = "state"][0]._text AS state, [X in drug._children WHERE X._type = "indication"][0]._text AS indication, [X in drug._children WHERE X._type = "pharmacodynamics"][0]._text AS pharmacodynamics, [X in drug._children WHERE X._type = "mechanism-of-action"][0]._text AS mechanism_of_action, [X in drug._children WHERE X._type = "toxicity"][0]._text AS toxicity, [X in drug._children WHERE X._type = "metabolism"][0]._text AS metabolism, [X in drug._children WHERE X._type = "absorption"][0]._text AS absorption, [X in drug._children WHERE X._type = "half-life"][0]._text AS half_life, [X in drug._children WHERE X._type = "route-of-elimination"][0]._text AS route_of_elimination, [X in drug._children WHERE X._type = "protein-binding"][0]._text AS protein_binding_info, [X in drug._children WHERE X._type = "volume-of-distribution"][0]._text AS volume_of_distribution, [X in drug._children WHERE X._type = "clearance"][0]._text AS clearance, [X in drug._children WHERE X._type = "fda-label"][0]._text AS fda_label, [X in drug._children WHERE X._type = "msds"][0]._text AS msds, [X in drug._children WHERE X._type = "synthesis-reference"][0]._text AS synthesis_reference, [X in drug._children WHERE X._type = "synonyms"] AS synonyms, [X in drug._children WHERE X._type = "drugbank-id" AND X.primary IS null] AS secondary_ids, [X in drug._children WHERE X._type = "groups"] AS groups, [X in drug._children WHERE X._type = "food-interactions"] AS food_interactions, [X in drug._children WHERE X._type = "affected-organisms"] AS affected_organisms MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }} ) FOREACH(ignoreMe IN CASE WHEN synthesis_reference IS NOT null THEN [1] ELSE [] END | FOREACH(ignoreMe IN CASE WHEN split(synthesis_reference, "\\"")[1] <> "" AND split(synthesis_reference, "\\"")[1] IS NOT null THEN [1] ELSE [] END | MERGE (p:Publication {{ Title:split(synthesis_reference, "\\"")[1] }}) SET p.Authors = split(synthesis_reference, "\\"")[0] SET p.Publication = "US Patent Office" SET p.Notes = split(split(synthesis_reference, "\\"")[2], ",")[0] SET p.Date = replace(split(split(synthesis_reference, "\\"")[2], ",")[1], "issued ", "") MERGE (d)-[r:CITED_IN]->(p) SET r.Type = "Synthesis" ) FOREACH(ignoreMe IN CASE WHEN split(split(synthesis_reference, ":")[1], ".")[0] <> "" AND split(split(synthesis_reference, ":")[1], ".")[0] IS NOT null THEN [1] ELSE [] END | MERGE (p:Publication {{ Title:split(split(synthesis_reference, ":")[1], ".")[0] }}) SET p.Authors = split(synthesis_reference, ":")[0] SET p.Publication = split(replace(synthesis_reference, split(synthesis_reference, ".")[0]+". ",""), ".")[0] SET p.Notes = split(replace(synthesis_reference, split(synthesis_reference, ".")[0]+". ",""), ".")[2] SET p.Date = split(split(replace(synthesis_reference, split(synthesis_reference, ".")[0]+". ",""), ".")[1],";")[0] SET p.Volume = split(split(synthesis_reference, ";")[1], "(")[0] SET p.Issue = split(split(synthesis_reference, "(")[1], ")")[0] SET p.Pages = split(split(synthesis_reference, ":")[-1], ".")[0] SET p.DOI = split(synthesis_reference, "doi:")[1] MERGE (d)-[r:CITED_IN]->(p) SET r.Type = "Synthesis" ) ) SET d.Name = name, d.Description = description, d.CAS_Number = cas_number, d.UNII = unii, d.State = state, d.Indication = indication, d.Pharmacodynamics = pharmacodynamics, d.Mechanism_of_Action = mechanism_of_action, d.Toxicity = toxicity, d.Metabolism = metabolism, d.Absorption = absorption, d.Half_Life = half_life, d.Route_of_Elimination = route_of_elimination, d.Protein_Binding_Info = protein_binding_info, d.Volume_of_Distribution = volume_of_distribution, d.Clearance = clearance, d.FDA_Label = fda_label, d.Safety_Data_Sheet = msds WITH food_interactions, synonyms, groups, affected_organisms, secondary_ids, d SET d.Synonyms = "", d.Food_Interactions = "", d.Groups = "", d.Alternative_DrugBank_IDs = "", d.Affected_Organisms = "" FOREACH(element in food_interactions| FOREACH(interaction in element._children| SET d.Food_Interactions = interaction._text + "," + d.Food_Interactions ) ) FOREACH(element in groups| FOREACH(group in element._children| SET d.Groups = group._text + "," + d.Groups ) ) FOREACH(element in synonyms| FOREACH(synonym in element._children| SET d.Synonyms = synonym._text + "," + d.Synonyms ) ) FOREACH(element in affected_organisms| FOREACH(organism in element._children| SET d.Affected_Organisms = organism._text + "," + d.Affected_Organisms ) ) FOREACH(element in secondary_ids| SET d.Alternative_DrugBank_IDs = element._text + "," + d.Alternative_DrugBank_IDs ) SET d.Groups = substring(d.Groups, 0, size(d.Groups) -1 ) SET d.Synonyms = substring(d.Synonyms, 0, size(d.Synonyms) -1 ) SET d.Food_Interactions = substring(d.Food_Interactions, 0, size(d.Food_Interactions) -1 ) SET d.Affected_Organisms = substring(d.Affected_Organisms, 0, size(d.Affected_Organisms) -1 ) """)
[docs]def add_general_references(filename): """ Creates "Publication" nodes based on XML files obtained from the DrugBank website. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: Since not all nodes present a "PubMed_ID" field (which would be ideal to uniquely-identify Publications, as the "Text" field is way more prone to typos/errors), nodes will be created using the "Authors" field. This means some duplicates might exist, which should be accounted for. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "general-references"] AS general_references MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH general_references, d UNWIND general_references AS general_reference WITH general_reference, d UNWIND general_reference["_children"] AS my_references WITH my_references, d UNWIND my_references["_children"] AS my_reference WITH [X in my_reference._children WHERE X._type = "citation"][0]._text AS citation, [X in my_reference._children WHERE X._type = "ref-id"][0]._text AS ref_id, [X in my_reference._children WHERE X._type = "pubmed-id"][0]._text AS pubmed_id, d FOREACH(ignoreMe IN CASE WHEN citation IS NOT null THEN [1] ELSE [] END | FOREACH(ignoreMe IN CASE WHEN split(replace(citation, split(citation, ":")[0]+": ", ""), ".")[0] <> "" THEN [1] ELSE [] END | MERGE (p:Publication {{Ref_ID:ref_id}}) SET p.Authors = split(citation, ":")[0] SET p.Title = split(replace(citation, split(citation, ":")[0]+": ", ""), ".")[0] SET p.Publication = split(replace(citation, split(citation, ".")[0]+". ",""), ".")[0] SET p.Notes = split(replace(citation, split(citation, ".")[0]+". ",""), ".")[2] SET p.Date = split(split(replace(citation, split(citation, ".")[0]+". ",""), ".")[1],";")[0] SET p.Volume = split(split(citation, ";")[1], "(")[0] SET p.Issue = split(split(citation, "(")[1], ")")[0] SET p.Pages = split(split(citation, ":")[-1], ".")[0] SET p.PubMed_ID = pubmed_id MERGE (d)-[r:CITED_IN]->(p) ) ) """)
[docs]def add_taxonomy(filename): """ Creates "Taxonomy" nodes based on XML files obtained from the DrugBank website. These represent the "kind" of Drug we are dealing with (Family, etc) Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: It only creates relationships in the Kingdom -> Super Class -> Class -> Subclass direction, and from any node -> Drug. This means that, if any member of the Kingdom -> Super Class -> Class -> Subclass is absent, the line will be broken; hopefully in that case a new Drug will come in to rescue and settle the relation! .. WARNING:: Some nodes without labels might be created if names are null: This has to be accounted for later on in the process """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "classification"] AS classification MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH classification, d UNWIND classification as my_nodes WITH [X IN my_nodes._children WHERE X._type = "description"][0]._text AS description, [X IN my_nodes._children WHERE X._type = "direct-parent"][0]._text AS direct_parent, [X IN my_nodes._children WHERE X._type = "kingdom"][0]._text AS kingdom, [X IN my_nodes._children WHERE X._type = "superclass"][0]._text AS super_class, [X IN my_nodes._children WHERE X._type = "class"][0]._text AS class, [X IN my_nodes._children WHERE X._type = "subclass"][0]._text AS sub_class, [X IN my_nodes._children WHERE X._type = "alternative-parent"] AS alternative_parents, [X IN my_nodes._children WHERE X._type = "substituents"] AS substituents, d as m // First, we create the Taxonomy nodes independently FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null THEN [1] ELSE [] END | MERGE (k:Taxonomy {{Type:"Kingdom", Name:kingdom}}) ) FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null THEN [1] ELSE [] END | MERGE (sp:Taxonomy {{Type:"Super Class", Name:super_class}}) ) FOREACH(ignoreMe IN CASE WHEN class IS NOT null THEN [1] ELSE [] END | MERGE (c:Taxonomy {{Type:"Class", Name:class}}) ) FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END | MERGE (sb:Taxonomy {{Type:"Sub Class", Name:sub_class}}) ) FOREACH(ignoreMe IN CASE WHEN direct_parent IS NOT null THEN [1] ELSE [] END | MERGE (dp:Taxonomy {{Name:direct_parent}}) MERGE (m)-[:PART_OF_CLADE]->(dp) ) // Then, we add a hierarchy connecting the nodes as much as possible between them FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END | MERGE (k:Taxonomy {{ Type:"Kingdom", Name:kingdom }}) MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }}) MERGE (k)-[:PART_OF_CLADE]->(sp) ) FOREACH(ignoreMe IN CASE WHEN class IS NOT null AND super_class IS NOT null THEN [1] ELSE [] END | MERGE (c:Taxonomy {{ Type:"Class", Name:class }}) MERGE (sp:Taxonomy {{ Type:"Super Class", Name:super_class }}) MERGE (sp)-[:PART_OF_CLADE]->(c) ) FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null AND class IS NOT null THEN [1] ELSE [] END | MERGE (c:Taxonomy {{ Type:"Class", Name:class }}) MERGE (sb:Taxonomy {{ Type:"Sub Class", Name:sub_class }}) MERGE (sb)-[:PART_OF_CLADE]->(c) ) // And we connect the hierarchy to the main node just once FOREACH(ignoreMe IN CASE WHEN sub_class IS NOT null THEN [1] ELSE [] END | MERGE (ta:Taxonomy {{ Name:sub_class }}) MERGE (m)-[:PART_OF_CLADE]->(ta) ) FOREACH(ignoreMe IN CASE WHEN class IS NOT null AND sub_class IS null THEN [1] ELSE [] END | MERGE (ta:Taxonomy {{ Name:class }}) MERGE (m)-[:PART_OF_CLADE]->(ta) ) FOREACH(ignoreMe IN CASE WHEN super_class IS NOT null AND sub_class IS null AND class IS null THEN [1] ELSE [] END | MERGE (ta:Taxonomy {{ Name:super_class }}) MERGE (m)-[:PART_OF_CLADE]->(ta) ) FOREACH(ignoreMe IN CASE WHEN kingdom IS NOT null AND sub_class IS null AND class IS null AND super_class IS null THEN [1] ELSE [] END | MERGE (ta:Taxonomy {{ Name:kingdom }}) MERGE (m)-[:PART_OF_CLADE]->(ta) ) // We add the alternative_parents in the appropriate format FOREACH(element in alternative_parents| MERGE (t:Taxonomy {{Name:element._text}}) MERGE (m)-[:PART_OF_CLADE]->(t) ) // If any Taxonomy is left without a connection, we connect it to the main graph // Beware: if any disconnected taxonomy is left from before, this could lead to errors WITH m, alternative_parents MATCH (tt:Taxonomy) WHERE NOT (tt)--() MERGE (m)-[:PART_OF_CLADE]->(tt) """)
[docs]def add_products(filename): """ Creates "Product" nodes based on XML files obtained from the DrugBank website. These are the individal medicaments that have been approved (or not) by the FDA Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. WARNING:: Using CREATE means that duplicates will appear; unfortunately, I couldnt any unique_id field to use as ID when MERGEing the nodes. This should be accounted for. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "products"] AS products MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH products, d UNWIND products AS product WITH product, d UNWIND product["_children"] AS my_product WITH [X in my_product._children WHERE X._type = "name"][0]._text AS name, [X in my_product._children WHERE X._type = "labeller"][0]._text AS labeller, [X in my_product._children WHERE X._type = "ndc-id"][0]._text AS ndc_id, [X in my_product._children WHERE X._type = "ndc-product-code"][0]._text AS ndc_product_code, [X in my_product._children WHERE X._type = "dpd-id"][0]._text AS dpd_id, [X in my_product._children WHERE X._type = "ema-product-code"][0]._text AS ema_product_code, [X in my_product._children WHERE X._type = "ema-ma-number"][0]._text AS ema_ma_number, [X in my_product._children WHERE X._type = "started-marketing-on"][0]._text AS started_marketing_on, [X in my_product._children WHERE X._type = "ended-marketing-on"][0]._text AS ended_marketing_on, [X in my_product._children WHERE X._type = "dosage-form"][0]._text AS dosage_form, [X in my_product._children WHERE X._type = "strength"][0]._text AS strength, [X in my_product._children WHERE X._type = "route"][0]._text AS route, [X in my_product._children WHERE X._type = "fda-application-number"][0]._text AS fda_application_number, [X in my_product._children WHERE X._type = "generic"][0]._text AS generic, [X in my_product._children WHERE X._type = "over-the-counter"][0]._text AS over_the_counter, [X in my_product._children WHERE X._type = "approved"][0]._text AS approved, [X in my_product._children WHERE X._type = "country"][0]._text AS country, [X in my_product._children WHERE X._type = "source"][0]._text AS source, d CREATE (p:Product) SET p.Labeller = labeller, p.NDC_ID = ndc_id, p.NDC_Product_Code = ndc_product_code, p.DPD_ID = dpd_id, p.EMA_Product_Code = ema_product_code, p.EMA_MA_Number = ema_ma_number, p.Started_Marketing_On = started_marketing_on, p.Ended_Marketing_On = ended_marketing_on, p.Dosage_Form = dosage_form, p.Strength = strength, p.Route = route, p.FDA_Application_Number = fda_application_number, p.Generic = generic, p.Over_the_Counter = over_the_counter, p.Approved = approved, p.Country = country, p.Source = source, p.Name = name MERGE (d)-[r:PART_OF_PRODUCT]->(p) """)
[docs]def add_mixtures(filename): """ Creates "Mixture" nodes based on XML files obtained from the DrugBank website. These are the mixtures of existing Drugs, which may or may not be on the market. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: This doesn't seem of much use, but has been added nonetheless just in case. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "mixtures"] AS mixtures MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH mixtures, d UNWIND mixtures as mixture WITH mixture, d UNWIND mixture["_children"] as my_mixture WITH [X in my_mixture._children WHERE X._type = "name"][0]._text AS name, [X in my_mixture._children WHERE X._type = "ingredients"][0]._text AS ingredient, d MERGE (m:Product {{ Name:name, Ingredient:ingredient }}) MERGE (d)-[r:PART_OF_PRODUCT]->(m) """)
[docs]def add_categories(filename): """ Creates "Category" nodes based on XML files obtained from the DrugBank website. These represent the different MeSH IDs a Drug can be related with Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: Each category seems to have an associated MeSH ID. Maybe could rename nodes as MeSH? """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "categories"] AS categories MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH categories, d UNWIND categories as category WITH category, d UNWIND category["_children"] as my_category WITH [X in my_category._children WHERE X._type = "category"][0]._text AS category, [X in my_category._children WHERE X._type = "mesh-id"][0]._text AS MESH_ID, d MERGE (c:MeSH {{ Name:category }}) FOREACH(ignoreMe IN CASE WHEN substring(d.Name, 0, 1) = "M" THEN [1] ELSE [] END | SET c.Type = "Concept", c.MeSH_ID = MESH_ID ) FOREACH(ignoreMe IN CASE WHEN substring(d.Name, 0, 1) = "D" THEN [1] ELSE [] END | SET c.Type = "Descriptor", c.MeSH_ID = MESH_ID ) SET c.MeSH_ID = MESH_ID MERGE (d)-[r:RELATED_MESH]->(c) """)
[docs]def add_manufacturers(filename): """ Creates "Company" nodes based on XML files obtained from the DrugBank website. These represent the different Companies that manufacture a Drug's compound (not just package it) Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "manufacturers"] AS manufacturers MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH manufacturers, d UNWIND manufacturers as manufacturer WITH manufacturer, d UNWIND manufacturer["_children"] as their_manufacturer WITH their_manufacturer.generic as generic, their_manufacturer.url as url, apoc.text.capitalizeAll(their_manufacturer._text) as name, d MERGE (c:Company {{ Name:name, Manufacturer:"True" }}) SET c.Generic = generic, c.Manufacturer = "True" FOREACH(ignoreMe IN CASE WHEN NOT url = "" THEN [1] ELSE [] END | SET c.URL = url ) MERGE (d)-[r:MANUFACTURED_BY]->(c) """)
[docs]def add_packagers(filename): """ Creates "Company" nodes based on XML files obtained from the DrugBank website. These represent the different Companies that package a Drug's compounds (not the ones that manufacture them) Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "packagers"] AS packagers MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH packagers, d UNWIND packagers as packager WITH packager, d UNWIND packager["_children"] as their_packager WITH [X in their_packager._children WHERE X._type = "name"][0]._text AS name, [X in their_packager._children WHERE X._type = "url"][0]._text AS url, d MERGE (c:Company {{ Name:name }}) SET c.Packager = "True" FOREACH(ignoreMe IN CASE WHEN NOT url = "" THEN [1] ELSE [] END | SET c.URL = url ) MERGE (d)-[r:MANUFACTURED_BY]->(c) """)
[docs]def add_dosages(filename): """ Creates "Dosage" nodes based on XML files obtained from the DrugBank website. These represent the different Dosages that a Drug should be administered at. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. WARNING:: Using CREATE might generate duplicate nodes, but there was no unique characteristic to MERGE nodes into. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "dosages"] AS dosages MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH dosages, d UNWIND dosages as dosage WITH dosage, d UNWIND dosage["_children"] as my_dosage WITH [X in my_dosage._children WHERE X._type = "form"][0]._text AS form, [X in my_dosage._children WHERE X._type = "route"][0]._text AS route, [X in my_dosage._children WHERE X._type = "strength"][0]._text AS strength, d CREATE (do:Dosage {{ Route:route }}) SET do.Strength = strength, do.Form = form MERGE (d)-[r:DOSED_AS]->(do) """)
[docs]def add_atc_codes(filename): """ Creates "ATC" nodes based on XML files obtained from the DrugBank website. These represent the different ATC codes a Drug can be related with (including an small taxonomy) Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS drugs UNWIND drugs AS drug WITH [X in drug._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in drug._children WHERE X._type = "atc-codes"] AS atc_codes WITH atc_codes, Primary_Drugbank_ID UNWIND atc_codes as atc_code WITH atc_code, Primary_Drugbank_ID UNWIND atc_code._children as my_atc WITH my_atc.code AS primary_atc, [X in my_atc._children WHERE X._type = "level"] AS levels, Primary_Drugbank_ID UNWIND levels as level WITH primary_atc, level.code as atc_subcode, level._text as atc_text, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) MERGE (pri:ATC {{ Code:primary_atc }}) MERGE (sec:ATC {{ Code:atc_subcode }}) SET sec.Name = atc_text, pri.Type = "Primary", sec.Type = "Secondary" MERGE (d)-[r:RELATED_ATC]->(pri) MERGE (pri)-[r2:RELATED_ATC]->(sec) """)
[docs]def add_drug_interactions(filename): """ Creates ```(d)-[r:RELATED_WITH_DRUG]-(dd)``` interactions between "Drug" nodes, whether they existed before or not. These are intentionally non-directional, as they should be related with each other. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "drug-interactions"] AS drug_interactions UNWIND drug_interactions AS drug_interaction WITH drug_interaction, Primary_Drugbank_ID UNWIND drug_interaction["_children"] AS my_interaction WITH [X in my_interaction._children WHERE X._type = "drugbank-id"][0]._text AS drugbank_id, [X in my_interaction._children WHERE X._type = "name"][0]._text AS name, [X in my_interaction._children WHERE X._type = "description"][0]._text AS description, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) MERGE (dd:Drug {{ DrugBank_ID:drugbank_id }}) ON CREATE SET dd.Name = name MERGE (d)-[r:INTERACTS_WITH]-(dd) SET r.Description = description """)
[docs]def add_sequences(filename): """ Creates "Sequence" nodes based on XML files obtained from the DrugBank website. These represent the AminoAcid sequence of Drugs that are of a peptidic nature. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. TODO:: In some other parts of the script, sequences are being added as properties on Protein nodes. A common format should be set. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "sequences"] AS sequences UNWIND sequences AS sequence WITH sequence, Primary_Drugbank_ID UNWIND sequence["_children"] AS my_sequence WITH my_sequence.format AS format, my_sequence._text as text, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) MERGE (s:Sequence {{ Sequence:text }}) SET s.Format = format SET s.Type = "PROT" MERGE (d)-[r:SEQUENCED_AS]->(s) FOREACH(ignoreMe IN CASE WHEN text IS NOT null THEN [1] ELSE [] END | SET d:Protein ) """)
[docs]def add_experimental_properties(filename): """ Adds some experimental properties to existing "Drug" nodes based on XML files obtained from the DrugBank website. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "experimental-properties"] AS experimental_properties MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH experimental_properties, d UNWIND experimental_properties as experimental_property WITH experimental_property, d UNWIND experimental_property["_children"] AS my_property WITH my_property, d WITH [X in my_property._children WHERE X._type = "kind"][0]._text AS kind, [X in my_property._children WHERE X._type = "value"][0]._text AS value, d WITH apoc.map.fromLists(collect(kind), collect(value)) AS dict, d SET d.Average_Mass = dict["Molecular Weight"], d.Isoelectric_Point = dict["Isoelectric Point"], d.Water_Solubility = dict["Water Solubility"], d.pKa = dict["pKa"], d.Hydrophobicity = dict.Hydrophobicity, d.Formula = dict["Molecular Formula"], d.Melting_Point = dict["Melting Point"], d.logP = dict["logP"] """)
[docs]def add_external_identifiers(filename): """ Adds some external identifiers to existing "Drug" nodes based on XML files obtained from the DrugBank website. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: These also adds a "Protein" label to any "Drug"-labeled nodes which have a "UniProtKB"-ID among their properties. NOTE that this can look confusing in the DB Schema!!! """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "external-identifiers"] AS external_identifiers WITH external_identifiers, Primary_Drugbank_ID UNWIND external_identifiers as external_identifier WITH external_identifier, Primary_Drugbank_ID UNWIND external_identifier["_children"] AS my_identifier WITH [X in my_identifier._children WHERE X._type = "resource"][0]._text AS resource, [X in my_identifier._children WHERE X._type = "identifier"][0]._text AS identifier, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) WITH apoc.map.fromLists(collect(resource), collect(identifier)) AS dict, d SET d.Therapeutic_Targets_Database = dict["Therapeutic Targets Database"], d.BindingDB = dict.BindingDB, d.UniProt_ID = dict["UniProtKB"], d.PubChem_ID= dict["PubChem Compound"], d.WikiPedia_Article = dict.Wikipedia, d.ChEMBL_ID = dict.ChEMBL, d.Genbank_Protein_ID = dict.GenBank, d.DPD_ID = dict["Drugs Product Database (DPD)"], d.RxCUI = dict["RxCUI"], d.PharmGKB = dict["PharmGKB"], d.ChemSpider_ID = dict.ChemSpider, d.KEGG_ID = dict["KEGG Drug"]+","+dict["KEGG Compound"] FOREACH(ignoreMe IN CASE WHEN d.UniProt_ID IS NOT null THEN [1] ELSE [] END | SET d:Protein ) """)
[docs]def add_external_equivalents(filename): """ Adds some external equivalents to existing "Drug" nodes based on XML files obtained from the DrugBank website. This should be "exact matches" of the Drug in other databases. Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. NOTE:: The main reason to add them as "External-Equivalents" is because I felt these IDs where of not much use (and are thus easier to eliminate due to their common label) """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "external-links"] AS external_links WITH external_links, Primary_Drugbank_ID UNWIND external_links as external_link WITH external_link, Primary_Drugbank_ID UNWIND external_link["_children"] AS my_identifier WITH [X in my_identifier._children WHERE X._type = "resource"][0]._text AS resource, [X in my_identifier._children WHERE X._type = "url"][0]._text AS url, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) MERGE (ee:ExternalEquivalent {{ URL:url }}) SET ee.Resource_Name = resource MERGE (d)-[r:EQUALS]-(ee) """)
[docs]def add_pathways_and_relations(filename): """ Adds "Pathway" nodes based on XML files obtained from the DrugBank website. It also adds some relations between Drugs and Proteins (which, remember, could even be the same kind of node) It is also able to tag both a Protein's and a Drug¡s relation with a given Pathway In general, a Pathway involves a collection of Enzymes, Drugs and Proteins, with a SMPDB_ID (cool for interconnexion!) Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. WARNING:: This function uses a "double UNWIND" clause, which means that we are only representing <pathways> tags with <enzymes> tags inside. Fortunately, this seems to seldom not happen, so it should represent no problem. """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "pathways"] AS pathways WITH pathways, Primary_Drugbank_ID UNWIND pathways AS pathway WITH pathway, Primary_Drugbank_ID UNWIND pathway["_children"] AS my_pathway WITH [X IN my_pathway._children WHERE X._type = "smpdb-id"][0]._text AS smpdb_id, [X IN my_pathway._children WHERE X._type = "name"][0]._text AS pathway_name, [X IN my_pathway._children WHERE X._type = "category"][0]._text AS category, [X IN my_pathway._children WHERE X._type = "drugs"] AS drugs, [X IN my_pathway._children WHERE X._type = "enzymes"] AS enzymes, Primary_Drugbank_ID UNWIND drugs as drug UNWIND enzymes as enzyme WITH drug, enzyme, smpdb_id, pathway_name, category, Primary_Drugbank_ID UNWIND drug._children as my_drug UNWIND enzyme._children as my_enzyme WITH my_enzyme._text AS UniProt_ID, [X IN my_drug._children WHERE X._type = "drugbank-id"][0]._text AS DrugBank_ID, [X IN my_drug._children WHERE X._type = "name"][0]._text AS drug_name, smpdb_id, pathway_name, category, Primary_Drugbank_ID MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) MERGE (s:Pathway {{ SMPDB_ID:smpdb_id }}) ON CREATE SET s.Name = pathway_name, s.Category = category MERGE (dd:Drug {{ DrugBank_ID:DrugBank_ID }}) ON CREATE SET dd.Name = drug_name MERGE (p:Protein {{ UniProt_ID:UniProt_ID }}) MERGE (d)-[r:PART_OF_PATHWAY]->(s) MERGE (dd)-[r2:PART_OF_PATHWAY]->(s) MERGE (p)-[r3:PART_OF_PATHWAY]->(s) MERGE (d)-[r4:INTERACTS_WITH]-(dd) MERGE (d)-[r5:INTERACTS_WITH]-(p) """)
[docs]def add_targets_enzymes_carriers_and_transporters(filename, tag_name): """ A *REALLY HUGE* function. It takes a filename and a tag_name, and gets info and creates "Protein" nodes with tag_name set as their role. It also adds a bunch of additional info, such as Publications, Targets, Actions, GO_IDs, PFAMs and/or some External IDs Args: tx (neo4j.Session): The session under which the driver is running filename (str): The name of the XML file that is being imported tag_name (str): The type of Protein node you want to import; it must be one of ["enzymes", "carriers", "transporters"] It is recommended that you run this function thrice, once for each type of protein Returns: neo4j.Result: A Neo4J connexion to the database that modifies it according to the CYPHER statement contained in the function. .. WARNING:: We are using a bunch of concatenated UNWINDs, which force the existance of all elements in the UNWIND chain. This might remove some elements, but this is a HUUUUUGE database, and, to be honest, most things seem to almost always be present. An example is References and Polypeptides; Since there seem to be more References that Polypeptides, we try to UNWIND those first. The same can be said on the rest of UNWINDS: as we have external-id >>>>>>>>>> go-classifier >>>> pfam >> synonyms (in order of *occurrence*. not *number* of tags) , we UNWIND in that order to mitigate data loss .. NOTE:: To fix repetitions in properties such as Actions or Synonyms (caused by the HUGE number of UNWINDs), we tried lots of different strategies, finally coming up with SET p.Synonyms = replace(p.Synonyms, synonym._text + ",", ""). This is cool! But means there will always be a trailing comma (removing it was not easy in this same transaction, though it could (TODO?) be done at the end. .. NOTE:: <tag_name> .. TODO:: Investigate https://stackoverflow.com/questions/14026217/using-neo4j-distinct-and-order-by-on-different-properties """ return (f""" CALL apoc.load.xml("{filename}") YIELD value WITH [x in value._children WHERE x._type = "drug"] AS metabolites UNWIND metabolites AS metabolite WITH [X in metabolite._children WHERE X._type = "drugbank-id" AND X.primary = "true"][0]._text AS Primary_Drugbank_ID, [X in metabolite._children WHERE X._type = "{tag_name}"] AS targets WITH targets, Primary_Drugbank_ID UNWIND targets AS target WITH target, Primary_Drugbank_ID UNWIND target["_children"] AS my_target WITH my_target.position AS position, [X IN my_target._children WHERE X._type = "id"][0]._text AS target_id, [X IN my_target._children WHERE X._type = "name"][0]._text AS name, [X IN my_target._children WHERE X._type = "organism"][0]._text AS target_organism, [X IN my_target._children WHERE X._type = "known-action"][0]._text AS known_action, [X IN my_target._children WHERE X._type = "actions"] AS actions, [X IN my_target._children WHERE X._type = "references"] AS references, [X IN my_target._children WHERE X._type = "polypeptide"] AS polypeptides, Primary_Drugbank_ID UNWIND references AS reference UNWIND reference._children AS this_reference UNWIND this_reference._children AS my_reference WITH my_reference._type AS ref_type, [X in my_reference._children WHERE X._type = "citation"][0]._text AS citation, [X in my_reference._children WHERE X._type = "ref-id"][0]._text AS ref_id, [X in my_reference._children WHERE X._type = "pubmed-id"][0]._text AS pubmed_id, Primary_Drugbank_ID, actions, polypeptides, target_id, name, target_organism, known_action, position MERGE (d:Drug {{ DrugBank_ID:Primary_Drugbank_ID }}) FOREACH(ignoreMe IN CASE WHEN citation IS NOT null THEN [1] ELSE [] END | FOREACH(ignoreMe IN CASE WHEN split(replace(citation, split(citation, ":")[0]+": ", ""), ".")[0] <> "" THEN [1] ELSE [] END | MERGE (pu:Publication {{ Ref_ID:ref_id }}) SET pu.Authors = split(citation, ":")[0] SET pu.Title = split(replace(citation, split(citation, ":")[0]+": ", ""), ".")[0] SET pu.Publication = split(replace(citation, split(citation, ".")[0]+". ",""), ".")[0] SET pu.Notes = split(replace(citation, split(citation, ".")[0]+". ",""), ".")[2] SET pu.Date = split(split(replace(citation, split(citation, ".")[0]+". ",""), ".")[1],";")[0] SET pu.Volume = split(split(citation, ";")[1], "(")[0] SET pu.Issue = split(split(citation, "(")[1], ")")[0] SET pu.Pages = split(split(citation, ":")[-1], ".")[0] SET pu.PubMed_ID = pubmed_id, pu.Type = ref_type MERGE (d)-[r2:CITED_IN]->(pu) ) ) WITH polypeptides, target_id, name, target_organism, known_action, actions, position, d UNWIND polypeptides AS polypeptide WITH polypeptide.id AS UniProt_ID, polypeptide.source AS polypeptide_source, [X in polypeptide._children WHERE X._type = "name"][0]._text AS polypeptide_name, [X in polypeptide._children WHERE X._type = "general-function"][0]._text AS general_function, [X in polypeptide._children WHERE X._type = "specific-function"][0]._text AS specific_function, [X in polypeptide._children WHERE X._type = "gene-name"][0]._text AS gene_name, [X in polypeptide._children WHERE X._type = "locus"][0]._text AS locus, [X in polypeptide._children WHERE X._type = "transmembrane-regions"][0]._text AS transmembrane_regions, [X in polypeptide._children WHERE X._type = "signal-regions"][0]._text AS signal_regions, [X in polypeptide._children WHERE X._type = "theoretical-pi"][0]._text AS theoretical_pi, [X in polypeptide._children WHERE X._type = "molecular-weight"][0]._text AS molecular_weight, [X in polypeptide._children WHERE X._type = "chromosome-location"][0]._text AS chromosome_location, [X in polypeptide._children WHERE X._type = "cellular-location"] AS cellular_location, [X in polypeptide._children WHERE X._type = "organism"][0]._text AS polypeptide_organism, [X in polypeptide._children WHERE X._type = "organism"][0]["ncbi-taxonomy-id"] AS ncbi_taxonomy_id, [X in polypeptide._children WHERE X._type = "amino-acid-sequence"][0]._text AS amino_acid_sequence, [X in polypeptide._children WHERE X._type = "amino-acid-sequence"][0].format AS amino_acid_sequence_format, [X in polypeptide._children WHERE X._type = "gene-sequence"][0]._text AS gene_sequence, [X in polypeptide._children WHERE X._type = "gene-sequence"][0].format AS gene_sequence_format, [X in polypeptide._children WHERE X._type = "go-classifiers"] AS go_classifiers, [X in polypeptide._children WHERE X._type = "synonyms"] AS synonyms, [X in polypeptide._children WHERE X._type = "pfams"] AS pfams, [X in polypeptide._children WHERE X._type = "external-identifiers"] AS external_identifiers, target_id, name, target_organism, known_action, actions, position, d MERGE (p:Protein {{ UniProt_ID:UniProt_ID }}) SET p.Name = polypeptide_name, p.Function = general_function, p.Specific_Function = specific_function, p.Gene_Name = gene_name, p.Locus = locus, p.Transmembrane_Regions = transmembrane_regions, p.Signal_Regions = signal_regions, p.Theoretical_PI = theoretical_pi, p.Average_Mass = molecular_weight, p.Organism = polypeptide_organism, p.Target_Position = position, p.Source = polypeptide_source, p.Target_ID = target_id, p.Taget_Name = name, p.Known_Action = known_action, p.Target_Organism = target_organism SET p.Function = "{tag_name}" MERGE (d)-[r:TARGETS]->(p) FOREACH(ignoreMe IN CASE WHEN amino_acid_sequence IS NOT null THEN [1] ELSE [] END | MERGE (se:Sequence {{ Sequence:amino_acid_sequence }} ) SET se.Format = amino_acid_sequence_format, se.Type="DNA", se.UniProt_ID = UniProt_ID, se.Chromosome_Location = chromosome_location MERGE (p)-[r:SEQUENCED_AS]->(se) ) FOREACH(ignoreMe IN CASE WHEN gene_sequence IS NOT null THEN [1] ELSE [] END | MERGE (se:Sequence {{ Sequence:gene_sequence }} ) SET se.Format = gene_sequence_format, se.Type="DNA", se.UniProt_ID = UniProt_ID, se.Chromosome_Location = chromosome_location MERGE (p)-[r:SEQUENCED_AS]->(se) ) SET d.Actions = "" FOREACH(element in actions| FOREACH(action in element._children| SET d.Actions = replace(d.Actions, action._text + ",", "") SET d.Actions = action._text + "," + d.Actions ) ) FOREACH(location IN cellular_location | MERGE (c:CelularLocation) SET c.Name = location._text MERGE (p)-[r:LOCATED_INSIDE_CELL]->(c) ) WITH external_identifiers, d, p, go_classifiers, pfams, synonyms UNWIND external_identifiers AS external_identifier UNWIND external_identifier._children AS my_identifiers WITH [X in my_identifiers._children WHERE X._type = "resource"][0]._text AS resource, [X in my_identifiers._children WHERE X._type = "identifier"][0]._text AS identifier, d, p, go_classifiers, pfams, synonyms WITH apoc.map.fromLists(collect(resource), collect(identifier)) AS dict, d, p, go_classifiers, pfams, synonyms SET d.IUPHAR_ID = dict["IUPHAR"], d.Guide_to_Pharmacology_ID = dict["Guide to Pharmacology"], d.GenAtlas_ID = dict["GenAtlas"], d.Genbank_Protein_ID = dict["GenBank Protein Database"], d.UniProt_ID = dict["UniProtKB"], d.HGNC_ID = dict["HUGO Gene Nomenclature Committee (HGNC)"], d.GenBank_Gene_ID = dict["GenBank Gene Database"] WITH go_classifiers, pfams, d, p, synonyms UNWIND go_classifiers AS go_classifier UNWIND pfams AS pfam WITH go_classifier, pfam, d, p, synonyms UNWIND go_classifier["_children"] AS my_go UNWIND pfam["_children"] AS my_pfam WITH [X in my_go._children WHERE X._type = "category"][0]._text AS category, [X in my_go._children WHERE X._type = "description"][0]._text AS description, [X in my_pfam._children WHERE X._type = "identifier"][0]._text AS identifier, [X in my_pfam._children WHERE X._type = "name"][0]._text AS name, synonyms, d, p MERGE (g:GeneOntology {{ Description:description }}) SET g.Category = category MERGE (pf:PFam {{ PFAM_ID:identifier }}) SET pf.Name = name MERGE (p)-[r:PART_OF_PFAM]->(pf) MERGE (p)-[r2:PART_OF_GENE_ONTOLOGY]-(g) SET p.Synonyms = "" FOREACH(element in synonyms| FOREACH(synonym in element._children| SET p.Synonyms = replace(p.Synonyms, synonym._text + ",", "") SET p.Synonyms = synonym._text + "," + p.Synonyms ) ) """)
[docs]def build_from_file(newfile, driver): """ A function able to build a portion of the DrugBank database in graph format, provided that one XML is supplied to it. This can either be the ```full_database.xml``` file that you can get in DrugBank's website, or a splitted version of it, with just one item per file (which is recommended due to memory limitations) Args: newfile (str): The path of the XML file to import driver (neo4j.Driver): Neo4J's Bolt Driver currently in use Returns: This function modifies the Neo4J Database as desired, but does not produce any particular return. """ misc.manage_transaction(add_drugs(newfile), driver) misc.manage_transaction(add_general_references(newfile), driver) misc.manage_transaction(add_taxonomy(newfile), driver) misc.manage_transaction(add_products(newfile), driver) misc.manage_transaction(add_mixtures(newfile), driver) misc.manage_transaction(add_categories(newfile), driver) misc.manage_transaction(add_manufacturers(newfile), driver) misc.manage_transaction(add_packagers(newfile), driver) misc.manage_transaction(add_dosages(newfile), driver) misc.manage_transaction(add_atc_codes(newfile), driver) misc.manage_transaction(add_drug_interactions(newfile), driver) misc.manage_transaction(add_sequences(newfile), driver) misc.manage_transaction(add_experimental_properties(newfile), driver) misc.manage_transaction(add_external_identifiers(newfile), driver) misc.manage_transaction(add_external_equivalents(newfile), driver) misc.manage_transaction(add_pathways_and_relations(newfile), driver) for element in ["enzymes", "carriers", "transporters"]: misc.manage_transaction(add_targets_enzymes_carriers_and_transporters(newfile, element), driver)