Source code for CanGraph.main

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-FileCopyrightText: 2022 Pablo Marcos <software@loreak.org>
#
# SPDX-License-Identifier: MIT

"""
A Python module that leverages the functions present in the :obj:`~CanGraph.miscelaneous`
module and all other subpackages to annotate metabolites using a graph format and Neo4J,
and then provides a GraphML export file.

CanGraph.main Usage
---------------------

To use this module:

.. argparse::
   :module: CanGraph.main
   :func: args_parser
   :prog: python3 main.py
   :nodefault:

You may find more info in the package's README.

.. NOTE:: For this program to work, the Git environment **has to be set up first**.
    You can ensure this by using: :obj:`CanGraph.setup.setup_git`

CanGraph.main Functions
-------------------------

This module is comprised of:
"""

# Import external modules necessary for the script
from alive_progress import alive_bar # A cute progress bar that shows the script is still running
import rdkit                         # Cheminformatics and ML package
import pandas as pd                  # Analysis of tabular data
import os, sys, shutil               # Vital modules to interact with the filesystem
from rdkit.Chem import MACCSkeys     # MACCS fingerprint calculation
from Bio import SeqIO                # Bioinformatics package
import re                            # Regular expression search
import argparse                      # Argument parser for Python
from contextlib import redirect_stdout # Redirect stdout, to not show things on the stdout
from copy import deepcopy            # Do deep copies of python objects
import bioservices                   # Query web bio-databases from python
import ijson                         # Read JSON files from Python in an iterative way
import logging                       # Make ``verbose`` messages easier to show

# Import internal modules for the program
import miscelaneous as misc
from GraphifyDrugBank import build_database as DrugBankDataBase
from GraphifyHMDB import build_database as HumanMetabolomeDataBase
from GraphifySMPDB import build_database as SmallMoleculePathWayDataBase
from ExposomeExplorer import build_database as ExposomeExplorerDataBase
from QueryWikidata import build_database as WikiDataBase
from MeSHandMetaNetX import build_database as MeSHandMetaNetXDataBases

def args_parser():
    """
    Parses the command line arguments into a more usable form, providing help and more

    Returns:
        argparse.ArgumentParser: The parser itself, which relates the different possible
        options for the program with their set values. If no command-line arguments
        are provided, the help message is shown and the program exits.

    .. NOTE:: Note that, in Google Docstrings, if you want a multi-line ``Returns``
        comment, you have to start it in a different line :(

    .. NOTE:: The return **must** be of type :obj:`argparse.ArgumentParser` for the
        ``argparse`` directive to work and auto-gen docs

    .. NOTE:: By using :obj:`argparse.const` instead of :obj:`argparse.default`, the check_file
        function will check "" (the current dir, always exists) if the arg is not provided,
        not breaking the function; if it is, it checks it.
    """
    parser = argparse.ArgumentParser(
        description = "A python utility to study and analyse cancer-associated metabolites "
                      "using knowledge graphs")

    parser.add_argument("-c", "--check_args", action="store_true",
                        help="Checks if the rest of the arguments are OK, then exits")
    parser.add_argument("-n", "--noindex", action="store_true",
                        help="Runs the program checking each file one-by-one, instead of using a JSON index")
    # NOTE: ``store_false`` so that passing ``-s`` *deactivates* similarity search, as the help
    # text describes; the original ``store_true`` with ``default=True`` made the flag a no-op
    parser.add_argument("-s", "--similarity", action="store_false", default=True,
                        help="Deactivates the import of information based on Structural Similarity, "
                             "which might dramatically increase processing time; default is True.")
    parser.add_argument("-w", "--webdbs", action="store_true", default=True,
                        help="Activates import of information based on web databases. "
                             "This might dramatically increase processing time; default is True.")
    parser.add_argument("-i", "--interactive", action="store_true", default=True,
                        help="Tells the script if it wants interaction from the user "
                             "and more information shown to them; similar to --verbose")

    parser.add_argument("--query", type=misc.check_file, required=True,
                        help="The location of the CSV file in which the program will search for metabolites")
    parser.add_argument("--dbfolder", type=misc.check_file, default="DataBases",
                        help="The folder indicated to ``setup.py`` as the one where your databases "
                             "will be stored; default is ``./DataBases``")
    parser.add_argument("--results", default="Results",
                        help="The folder where the resulting GraphML exports will be stored; "
                             "default is ``./Results``")
    parser.add_argument("--adress", type=misc.check_neo4j_protocol, default="bolt://localhost:7687",
                        help="the URL of the database, in neo4j:// or bolt:// format")
    parser.add_argument("--username", default="neo4j",
                        help="the username of the neo4j database in use")
    parser.add_argument("--password", default="neo4j",
                        help="the password for the neo4j database in use. NOTE: "
                             "Since passed through bash, you may need to escape some chars")

    # If no args are provided, show the help message and exit
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # If only an arguments check was requested, parse them (erroring as needed) and exit
    if "--check_args" in sys.argv:
        parser.parse_args()
        sys.exit(0)

    return parser

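# Usage sketch for args_parser (illustrative, not executed on import; the CSV file
# name is a placeholder, and misc.check_file must be able to locate it):
#
#     parser = args_parser()
#     args = parser.parse_args(["--query", "metabolites.csv"])
#     print(args.adress, args.similarity)   # -> bolt://localhost:7687 True
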
def improve_search_terms_with_metanetx(query, query_type, driver,
                                       chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    Improves the search terms already provided to the CanGraph programme by using
    the MetaNetX web service to find synonyms in IDs

    Args:
        query (str): The term we are currently querying for
        query_type (str): The kind of query to search; one of
            ["ChEBI_ID", "HMDB_ID", "Name", "InChI", "MeSH_ID"]
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use
        chebi_ids (list): A list of all the ChEBI_ID values representing the current metabolite
        names (list): A list of all the Name values representing the current metabolite
        hmdb_ids (list): A list of all the HMDB_ID values representing the current metabolite
        inchis (list): A list of all the InChI values representing the current metabolite
        mesh_ids (list): A list of all the MeSH_ID values representing the current metabolite

    Returns:
        list: A list containing [ chebi_ids, names, hmdb_ids, inchis, mesh_ids ],
        with all their synonyms
    """
    # First, search in MetaNetX
    with driver.session() as session:
        # For MeSH, we cannot search for synonyms in MetaNetX (it doesn't index them), so we instead
        # look for related metabolites in MeSH itself, so that the import makes more sense
        if query_type == "MeSH_ID":
            graph_response = misc.manage_transaction(
                MeSHandMetaNetXDataBases.find_metabolites_related_to_mesh(query), driver)
        else:
            graph_response = misc.manage_transaction(
                MeSHandMetaNetXDataBases.read_synonyms_in_metanetx(query_type, query), driver)

    for element in graph_response:
        # Drop null values from each record
        element = {key: element[key] for key in element if element[key] is not None}

        # With .get, the lookup does not fail, even if query_type == "MeSH_ID"
        if element.get("databasename", "").lower() == "hmdb":
            if element["databaseid"] not in hmdb_ids:
                hmdb_ids.append(element["databaseid"])
        if element.get("databasename", "").lower() == "chebi":
            if element["databaseid"] not in chebi_ids:
                chebi_ids.append(element["databaseid"])

        # Only append InChIs and Names that are present and not already known
        inchi = element.get("InChI", "")
        if inchi and inchi not in inchis:
            inchis.append(inchi)
        name = element.get("Name", "")
        if name and name not in names:
            names.append(name)

    return [ chebi_ids, names, hmdb_ids, inchis, mesh_ids ]

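# A quick illustration of the null-filtering comprehension used above, on a
# made-up record (a plain dict here for simplicity):
#
#     element = {"databasename": "hmdb", "databaseid": "HMDB0000122", "InChI": None}
#     {k: element[k] for k in element if element[k] is not None}
#     # -> {'databasename': 'hmdb', 'databaseid': 'HMDB0000122'}
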
def improve_search_terms_with_cts(query, query_type,
                                  chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    Improves the search terms already provided to the CanGraph programme by using
    The Chemical Translation Service to find synonyms in IDs

    Args:
        query (str): The term we are currently querying for
        query_type (str): The kind of query to search; one of
            ["ChEBI_ID", "HMDB_ID", "Name", "InChI", "MeSH_ID"]
        chebi_ids (list): A list of all the ChEBI_ID values representing the current metabolite
        names (list): A list of all the Name values representing the current metabolite
        hmdb_ids (list): A list of all the HMDB_ID values representing the current metabolite
        inchis (list): A list of all the InChI values representing the current metabolite
        mesh_ids (list): A list of all the MeSH_ID values representing the current metabolite

    Returns:
        list: A list containing [ chebi_ids, names, hmdb_ids, inchis, mesh_ids ],
        with all their synonyms
    """
    # We get the correct names for the identifiers we want to translate into:
    query_dict_keys = ["ChEBI_ID", "HMDB_ID", "Name", "InChI", "MeSH_ID"]
    oldIdentifier = [ x for x in query_dict_keys if x not in [query_type, "MeSH_ID"] ]
    replace_keys = {"ChEBI_ID": "ChEBI", "HMDB_ID": "Human Metabolome Database",
                    "Name": "Chemical Name", "InChI": "InChIKey"}
    toIdentifierList = list(map(replace_keys.get, oldIdentifier, oldIdentifier))
    fromIdentifier = replace_keys.get(query_type, query_type)

    # Convert the InChI to an InChIKey (required by CTS)
    if query_type == "InChI":
        rdkit_mol = rdkit.Chem.MolFromInchi(query)
        searchTerm = rdkit.Chem.MolToInchiKey(rdkit_mol)
    else:
        searchTerm = query

    # And run the CTS search:
    for toIdentifier in toIdentifierList:
        result = MeSHandMetaNetXDataBases.find_synonyms_in_cts(fromIdentifier, toIdentifier, searchTerm)
        for element in result:
            # And append its results to the correct list
            if toIdentifier == "Human Metabolome Database" and element not in hmdb_ids:
                hmdb_ids.append(element)
            if toIdentifier == "ChEBI" and element.replace("CHEBI:", "") not in chebi_ids:
                chebi_ids.append(element.replace("CHEBI:", ""))
            if toIdentifier == "Chemical Name" and element not in names:
                names.append(element)
            if toIdentifier == "InChIKey":
                # Convert the InChIKey back to an InChI using UniChem
                unichem = bioservices.UniChem()
                inchis_from_unichem = unichem.get_inchi_from_inchikey(element)
                inchis.append(inchis_from_unichem[0]["standardinchi"])

    return [ chebi_ids, names, hmdb_ids, inchis, mesh_ids ]

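# A minimal sketch of the InChI -> InChIKey conversion that CTS requires, using
# ethanol as an arbitrary example molecule:
#
#     mol = rdkit.Chem.MolFromInchi("InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3")
#     rdkit.Chem.MolToInchiKey(mol)   # -> 'LFQSCWFLJHTTHZ-UHFFFAOYSA-N'
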
def improve_search_terms(driver, chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    Improves the search terms already provided to the CanGraph programme by processing
    the text strings and finding synonyms in various platforms

    Args:
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use
        chebi_ids (str): A string of ";" separated values of all the ChEBI_ID representing the current metabolite
        names (str): A string of ";" separated values of all the Name representing the current metabolite
        hmdb_ids (str): A string of ";" separated values of all the HMDB_ID representing the current metabolite
        inchis (str): A string of ";" separated values of all the InChI representing the current metabolite
        mesh_ids (str): A string of ";" separated values of all the MeSH_ID representing the current metabolite

    Returns:
        list: A list containing [ chebi_ids, names, hmdb_ids, inchis, mesh_ids ],
        with all their synonyms
    """
    # First, we convert the strings we have received from the function call into lists,
    # by splitting on ";" and stripping the optional, case-insensitive prefixes:
    replace_chebis = re.compile(re.escape('ChEBI:'), re.IGNORECASE)
    replace_meshes = re.compile(re.escape('MeSH:'), re.IGNORECASE)
    chebi_ids = replace_chebis.sub('', chebi_ids).split(";")
    mesh_ids = replace_meshes.sub('', mesh_ids).split(";")
    names = names.split(";"); hmdb_ids = hmdb_ids.split(";"); inchis = inchis.split(";")

    # Then, we may search for synonyms for each field using MetaNetX
    # To do this, we organize a dict so that we can iterate more easily
    query_dict = {"ChEBI_ID": chebi_ids, "HMDB_ID": hmdb_ids, "InChI": inchis,
                  "Name": names, "MeSH_ID": mesh_ids}
    query_dict = deepcopy(query_dict) # Make a deepcopy of the dict so that it doesn't update while on the loop

    # And then, proceed to search for synonyms, appending the appropriate results if they are not already there
    with alive_bar( sum( [ len(v) for v in query_dict.values() ] ) + 4,
                    title="Finding Synonyms...") as bar:
        for query_type, query_list in query_dict.items():
            for query in query_list:
                if query and not query.isspace():
                    chebi_ids, names, hmdb_ids, inchis, mesh_ids = (
                        improve_search_terms_with_metanetx(query, query_type, driver,
                                                           chebi_ids, names, hmdb_ids, inchis, mesh_ids))
                bar()

        # Once we have all the synonyms that we could find on MeSHandMetaNetX, we remove all duplicates
        chebi_ids = list(filter(None, set(chebi_ids))); names = list(filter(None, set(names)))
        inchis = list(filter(None, set(inchis))); mesh_ids = list(filter(None, set(mesh_ids)))
        hmdb_ids = list(filter(None, set(hmdb_ids)))

        # And re-generate the query_dict
        query_dict = {"ChEBI_ID": chebi_ids, "HMDB_ID": hmdb_ids, "InChI": inchis,
                      "Name": names, "MeSH_ID": mesh_ids}
        query_dict = deepcopy(query_dict) # Make a deepcopy of the dict so that it doesn't update while on the loop

        # And search for even more synonyms on CTS, the Chemical Translation Service!
        # This is done separately because, while MeSH and MetaNetX do not overlap with
        # each other, they do overlap with CTS
        for query_type, query_list in {key: query_dict[key] for key in query_dict
                                       if key != 'MeSH_ID' and any(query_dict[key])}.items():
            for query in query_list:
                chebi_ids, names, hmdb_ids, inchis, mesh_ids = (
                    improve_search_terms_with_cts(query, query_type,
                                                  chebi_ids, names, hmdb_ids, inchis, mesh_ids))
                bar()

    # Finally, we remove outdated HMDB IDs
    regex = re.compile(r'^HMDB\d\d\d\d\d$')
    hmdb_ids = [i for i in hmdb_ids if regex.match(str(i))]

    # And return the simplified versions of the lists
    return [ list(filter(None, set(chebi_ids))), list(filter(None, set(names))),
             list(filter(None, set(hmdb_ids))), list(filter(None, set(inchis))),
             list(filter(None, set(mesh_ids))) ]

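# For reference, the case-insensitive prefix stripping and splitting above turns a
# raw query string into a clean list of IDs, e.g.:
#
#     replace_chebis = re.compile(re.escape('ChEBI:'), re.IGNORECASE)
#     replace_chebis.sub('', "chebi:17234;ChEBI:4167").split(";")
#     # -> ['17234', '4167']
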
def find_reasons_to_import_inchi(query, subject):
    """
    Takes two chains of text and finds if the ``query`` is present in the ``subject``,
    or if there are molecules common between them with at least 95% similarity

    Args:
        query (str or list): A string or list of strings describing valid InChI(s)
        subject (str): A valid InChI

    Returns:
        dict: A dict with each query as a key and the reason to import it as value, if there is one.

    .. seealso:: This approach was taken from
        `Chemistry StackExchange #82144
        <https://chemistry.stackexchange.com/questions/82144/what-is-the-correct-regular-expression-for-inchi>`_

    .. NOTE:: Since this is a one-to-one comparison, subject and query can be used
        interchangeably; however, bear in mind that only the query can be provided as a list
    """
    reasons_to_import_inchi = {}
    # NOTE: Wrap a single string in a list; ``list(query)`` would split it into characters
    queries = [query] if isinstance(query, str) else list(query)

    try:
        MolSubject = rdkit.Chem.MolFromInchi(subject)
        MACCSSubject = MACCSkeys.GenMACCSKeys(MolSubject)
    except Exception:
        return

    for each_query in queries:
        try:
            Query = rdkit.Chem.MolFromInchi(each_query)
            MACCSQuery = MACCSkeys.GenMACCSKeys(Query)
        except Exception:
            continue

        DICE_MACCS = rdkit.DataStructs.DiceSimilarity(MACCSQuery, MACCSSubject)
        if DICE_MACCS > 0.95:
            dice_maccs_import = f"DICE-MACCS {100*round(DICE_MACCS, 4)} % similarity"
            reasons_to_import_inchi.setdefault(each_query, dice_maccs_import)

    return reasons_to_import_inchi

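# A self-contained sketch of the DICE-MACCS comparison performed above, with two
# arbitrary molecules (ethanol vs. methanol); values near 1.0 mean "almost identical":
#
#     m1 = rdkit.Chem.MolFromInchi("InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3")   # ethanol
#     m2 = rdkit.Chem.MolFromInchi("InChI=1S/CH4O/c1-2/h2H,1H3")          # methanol
#     fp1, fp2 = MACCSkeys.GenMACCSKeys(m1), MACCSkeys.GenMACCSKeys(m2)
#     rdkit.DataStructs.DiceSimilarity(fp1, fp2)   # -> a float in [0, 1]
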
def find_reasons_to_import_all_files(filepath, similarity, chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    Finds reasons to import a metabolite given a candidate filepath **with one metabolite
    per file** and a series of lists containing all synonyms of the values considered
    reasons for import

    Args:
        filepath (str): The path to the file in which we will search for reasons to import
        similarity (bool): Whether to use similarity as a measure to import or not
        chebi_ids (list): A list of all the ChEBI_ID which are considered a reason to import
        names (list): A list of all the Name which are considered a reason to import
        hmdb_ids (list): A list of all the HMDB_ID which are considered a reason to import
        inchis (list): A list of all the InChI which are considered a reason to import
        mesh_ids (list): A list of all the MeSH_ID which are considered a reason to import

    Returns:
        tuple: A list of the methods that turned out to be valid for import, such as Name,
        ChEBI_ID..., and a dict mapping each method to the identifiers that matched
    """
    import_based_on = []; text = ""
    importing_ids = {}
    with open(f'{filepath}', "r") as f:
        text = f.read()

    # We try to find exact InChI matches:
    if any((match := inchi) in text for inchi in inchis):
        import_based_on.append("Exact InChI")
        importing_ids.setdefault("InChI", []).append(match)
    # If none is found, we use the "Similarity Evaluator" metric
    elif "InChI=" in text and any(inchis) and similarity:
        results = re.search(r"InChI\=1S?\/[A-Za-z0-9\.]+(\+[0-9]+)?(\/[cnpqbtmsih][A-Za-z0-9\-\+\(\)\,\/\?\;\.]+)*(\"|\<)", text)
        if results:
            result = results.group(0).replace("<", "").replace("\"", "")
            reason_to_import = find_reasons_to_import_inchi(inchis, result)
            if reason_to_import:
                import_based_on.append(list(set(reason_to_import.values())))
                # NOTE: We record the queries that were similar enough, instead of reusing
                # the stale ``match`` left over from the failed exact search above
                importing_ids.setdefault("InChI", []).extend(reason_to_import.keys())

    # For ChEBI, if we are using E-E or SMPDB, and since they don't have a prefix
    # (i.e. they are only a number), we have to process the files.
    if "ExposomeExplorer/components" in filepath or "SMPDB/smpdb_metabolites" in filepath:
        component = pd.read_csv(os.path.abspath(filepath), dtype = str)
        if "ExposomeExplorer/components" in filepath:
            if str(component["chebi_id"][0]) != "nan":
                # NOTE: Wrap in a list; ``list()`` on a string would split it into characters
                chebi_query = [component["chebi_id"][0]]
            else:
                chebi_query = []
        elif "SMPDB/smpdb_metabolites" in filepath:
            chebi_query = list(component["ChEBI ID"])

        # NOTE: Here, we remove the optional CHEBI: prefix
        chebi_subject = [chebi_id.replace("CHEBI:", "").replace("chebi:", "") for chebi_id in chebi_ids]
        # And we check the two lists against each other
        if chebi_query and any((match := item) in chebi_query for item in chebi_subject):
            import_based_on.append("ChEBI_ID")
            importing_ids.setdefault("ChEBI_ID", []).append(match)
    else:
        # And, even if it's not E-E, we still need to add the tag before for things to match
        for chebi_query in chebi_ids:
            replace_tag_exp = re.compile(re.escape('ChEBI:'), re.IGNORECASE)
            if f"<chebi_id>{replace_tag_exp.sub('', chebi_query)}" in text:
                import_based_on.append("ChEBI_ID")
                importing_ids.setdefault("ChEBI_ID", []).append(chebi_query)

    # For MeSH, we just check for the tags
    for mesh_id in mesh_ids:
        replace_tag_exp = re.compile(re.escape('MeSH:'), re.IGNORECASE)
        if f"<mesh-id>{replace_tag_exp.sub('', mesh_id)}" in text:
            import_based_on.append("MeSH_ID")
            importing_ids.setdefault("MeSH_ID", []).append(mesh_id)

    # For the rest of the databases, we simply search for exact matches between our lists and the texts:
    if any((match := hmdb) in text for hmdb in hmdb_ids):
        import_based_on.append("HMDB_ID")
        importing_ids.setdefault("HMDB_ID", []).append(match)
    if any((match := name) in text for name in names):
        import_based_on.append("Name")
        importing_ids.setdefault("Name", []).append(match)

    # We return a list built via dict.fromkeys to remove duplicates from the "reasons to import" list
    return list(dict.fromkeys(import_based_on)), importing_ids

def build_from_file(filepath, Neo4JImportPath, driver):
    """
    Imports a given metabolite from a single-metabolite containing file by checking
    its type and calling the appropriate import functions.

    Args:
        filepath (str): The path to the file which will be imported
        Neo4JImportPath (str): The path which Neo4J will use to import data
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function does not provide a particular return, but rather imports the requested file

    .. NOTE:: The ``filepath`` may be absolute or relative, but it is transformed to a relative
        ``relpath`` in order to remove possible influence of higher-name folders in the import
        type selection. This is also why the condition is stated as a big "if/elif/else"
        instead of a series of "ifs"
    """
    relpath = os.path.relpath(filepath, ".")
    filepath = os.path.abspath(filepath)
    fixedpath = os.path.basename(filepath).replace(" ", "_")
    shutil.copyfile(filepath, f"{Neo4JImportPath}/{fixedpath}")

    if "DrugBank" in relpath:
        DrugBankDataBase.build_from_file(f"{fixedpath}", driver)
        os.remove(f"{Neo4JImportPath}/{os.path.basename(fixedpath)}")

    elif "HMDB" in relpath:
        if "protein" in relpath:
            HumanMetabolomeDataBase.build_from_protein_file(f"{fixedpath}", driver)
        elif "metabolite" in relpath:
            HumanMetabolomeDataBase.build_from_metabolite_file(f"{fixedpath}", driver)
        os.remove(f"{Neo4JImportPath}/{os.path.basename(fixedpath)}")

    elif "SMPDB" in relpath:
        # NOTE: Since this adds a ton of low-resolution nodes, maybe have this db run first?
        # We will ignore the smpdb_pathways file because it doesn't have "real" identifiers
        if "proteins" in relpath:
            SmallMoleculePathWayDataBase.build_from_file(filepath, Neo4JImportPath, driver, "Protein")
        if "metabolites" in relpath:
            SmallMoleculePathWayDataBase.build_from_file(filepath, Neo4JImportPath, driver, "Metabolite")

    elif "ExposomeExplorer/components" in relpath:
        # NOTE: Since only "components" can result in a match based on our current criteria,
        # we will build the DB starting with the components only.
        # Here, instead of using shutil.copyfile, we use pandas to purge the _count columns when copying
        original_file = pd.read_csv(filepath)[[x for x in open(f"{filepath}").readline().rstrip().split(",")
                                               if not x.endswith('_count')]]
        original_file.to_csv(f"{Neo4JImportPath}/{os.path.basename(filepath)}", index=False)
        with driver.session() as session:
            misc.manage_transaction(ExposomeExplorerDataBase.add_components(os.path.basename(filepath)), driver)
        os.remove(f"{Neo4JImportPath}/{os.path.basename(filepath)}")
        ExposomeExplorerDataBase.build_from_file(os.path.dirname(filepath), Neo4JImportPath, driver,
                                                 keep_counts_and_displayeds = False)

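# The header-based column purge above can be illustrated in isolation; only columns
# whose names do not end in "_count" survive:
#
#     header = "id,name,cas_number,publications_count"
#     [x for x in header.rstrip().split(",") if not x.endswith('_count')]
#     # -> ['id', 'name', 'cas_number']
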
def import_based_on_all_files(all_files, Neo4JImportPath, driver, similarity,
                              chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    A function that searches inside a series of lists, provided as arguments, and imports
    the metabolites matching those present in them, iterating over a list of files which
    may contain relevant information to be imported

    Args:
        all_files (list): A list of all the possible files where we want to look for info
        Neo4JImportPath (str): The path which Neo4J will use to import data
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use
        similarity (bool): Whether to use similarity as a measure to import or not
        chebi_ids (list): A list of all the ChEBI_ID which are considered a reason to import
        names (list): A list of all the Name which are considered a reason to import
        hmdb_ids (list): A list of all the HMDB_ID which are considered a reason to import
        inchis (list): A list of all the InChI which are considered a reason to import
        mesh_ids (list): A list of all the MeSH_ID which are considered a reason to import
    """
    # Set up the progress bar
    with alive_bar( len(all_files), title="Scanning all files for matches...") as bar:
        i = 0 # Initialize a counter for verbose messages on slurm
        # And search for them in the all_files list we created earlier on, based on a series of criteria:
        for filepath in all_files:
            import_based_on, importing_ids = find_reasons_to_import_all_files(
                filepath, similarity, chebi_ids, names, hmdb_ids, inchis, mesh_ids)

            # Once we know the reasons to import (this is done so that it only cycles one
            # time through the code), we import the files themselves
            if import_based_on:
                build_from_file(filepath, Neo4JImportPath, driver)
                for item_type, items in importing_ids.items():
                    for item in items:
                        # We add the OriginalMetabolite label here; this will necessarily create duplicates,
                        # but this will be handled later on when the DB gets purged
                        with driver.session() as session:
                            misc.manage_transaction(link_to_original_data(item_type, item, import_based_on),
                                                    driver)

            if i % 15000 == 0 and i > 1:
                logging.info(f"Scanned file: {i} / {len(all_files)}")
            i += 1; bar() # And advance, of course

def import_based_on_index(databasefolder, Neo4JImportPath, driver, similarity,
                          chebi_ids, names, hmdb_ids, inchis, mesh_ids):
    """
    A function that searches inside a series of lists, provided as arguments, and imports
    the metabolites matching those present in them, using a JSON file to map the bits of
    the databases where the relevant information lies

    Args:
        databasefolder (str): The main folder where all the databases we will be using are to be found.
            There *must* be an index.json file located at ``databasefolder``/index.json
        Neo4JImportPath (str): The path which Neo4J will use to import data
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use
        similarity (bool): Whether to use similarity as a measure to import or not
        chebi_ids (list): A list of all the ChEBI_ID which are considered a reason to import
        names (list): A list of all the Name which are considered a reason to import
        hmdb_ids (list): A list of all the HMDB_ID which are considered a reason to import
        inchis (list): A list of all the InChI which are considered a reason to import
        mesh_ids (list): A list of all the MeSH_ID which are considered a reason to import
    """
    # Set the databasefolder to be an absolute path
    databasefolder = os.path.abspath(databasefolder)
    # We assume there is an index at the ``databasefolder``
    index_path = f"{databasefolder}/index.json"

    # Define the function we will use for deciding on importing a given file
    def find_reasons_to_import_from_index(item_list, item_type, previous_files, driver):
        # We use ijson to prevent collapse when reading the huge index file
        index_file = open(index_path, "r")
        for record in ijson.items(index_file, item_type):
            for item in item_list:
                # We initialize the list of reasons to import this specific ID
                import_based_on = []
                # And get the files to import based on exact matches of the ID on the index
                for each_file in list(record.get(item, "")):
                    if each_file not in previous_files:
                        previous_files.append(each_file)
                        import_based_on.append(item_type)

                # Only if the exact InChI search fails does it make sense to do a similarity search
                if item_type == "InChI" and "InChI" not in import_based_on and similarity:
                    all_inchis_on_index = record.keys()
                    reason_to_import = find_reasons_to_import_inchi(all_inchis_on_index, item)
                    if reason_to_import:
                        import_based_on.append(list(set(reason_to_import.values())))
                        for each_file in list(record.get(item, "")):
                            if each_file not in previous_files:
                                previous_files.append(each_file)

                if import_based_on:
                    # We add the OriginalMetabolite label here; this will necessarily create duplicates,
                    # but this will be handled later on when the DB gets purged
                    with driver.session() as session:
                        misc.manage_transaction(link_to_original_data(item_type, item, import_based_on),
                                                driver)
        index_file.close()
        return previous_files

    all_files = []

    # Calculate the files that we will need to import
    logging.info("Finding Reasons to Import...")
    all_files = find_reasons_to_import_from_index(chebi_ids, "ChEBI_ID", all_files, driver)
    all_files = find_reasons_to_import_from_index(hmdb_ids, "HMDB_ID", all_files, driver)
    all_files = find_reasons_to_import_from_index(mesh_ids, "MeSH_ID", all_files, driver)
    all_files = find_reasons_to_import_from_index(names, "Name", all_files, driver)
    all_files = find_reasons_to_import_from_index(inchis, "InChI", all_files, driver)

    i = 0
    # And import them
    with alive_bar(len(all_files), title="Importing Selected Files...") as bar:
        for each_file in all_files:
            filepath = f"{databasefolder}/{each_file}"
            # We can only import a file if it exists, of course; files may not actually
            # exist if using an index generated with a different number of DBs / DB version
            if os.path.isfile(filepath):
                build_from_file(filepath, Neo4JImportPath, driver)
            if i % 150 == 0 and i > 1:
                logging.info(f"Importing file: {i} / {len(all_files)}")
            i += 1; bar() # And advance, of course

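# NOTE: The shape assumed for index.json (illustrative, inferred from the access pattern
# above): a top-level object keyed by identifier type, where each type maps identifiers
# to the relative paths of the files that mention them, e.g.:
#
#     {"ChEBI_ID": {"17234": ["HMDB/metabolites/HMDB0000122.xml"]},
#      "HMDB_ID":  {"HMDB0000122": ["HMDB/metabolites/HMDB0000122.xml"]}}
#
# ijson.items(index_file, "ChEBI_ID") then streams that inner mapping without
# loading the whole file into memory.
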
def annotate_using_wikidata(driver):
    """
    Once we finish the search, we annotate the nodes added to the database using WikiData

    Args:
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.

    .. TODO:: When fixing queries, fix the main subscript also
    """
    with driver.session() as session, alive_bar( 53, title="Querying WikiData...") as bar:
        misc.manage_transaction(WikiDataBase.add_wikidata_and_mesh_by_name(), driver); bar()
        # The ``query`` param is, remember, so as to remove the wikidata_id search which is the default
        misc.manage_transaction(WikiDataBase.add_metabolite_info(query = "ChEBI_ID"), driver); bar()
        misc.manage_transaction(WikiDataBase.add_drug_external_ids(query = "DrugBank_ID"), driver); bar()
        misc.manage_transaction(WikiDataBase.add_more_drug_info(query = "DrugBank_ID"), driver); bar()
        # Called three times in a row, seemingly so that subclasses-of-subclasses are also captured
        misc.manage_transaction(WikiDataBase.find_subclass_of_disease(), driver); bar()
        misc.manage_transaction(WikiDataBase.find_subclass_of_disease(), driver); bar()
        misc.manage_transaction(WikiDataBase.find_subclass_of_disease(), driver); bar()
        misc.manage_transaction(WikiDataBase.find_instance_of_disease(), driver); bar()

        # For each of the 10 digits a wikidata_id may end in
        for number in range(10):
            misc.manage_transaction(WikiDataBase.add_disease_info(number=number), driver); bar()
            misc.manage_transaction(WikiDataBase.add_drugs(number=number), driver); bar()
            misc.manage_transaction(WikiDataBase.add_causes(number=number), driver); bar()
            misc.manage_transaction(WikiDataBase.add_genes(number=number), driver); bar()

        misc.manage_transaction(WikiDataBase.add_drug_external_ids(), driver); bar()
        misc.manage_transaction(WikiDataBase.add_more_drug_info(), driver); bar()
        misc.manage_transaction(WikiDataBase.add_yet_more_drug_info(), driver); bar()
        misc.manage_transaction(WikiDataBase.add_gene_info(), driver); bar()
        misc.manage_transaction(WikiDataBase.add_metabolite_info(), driver); bar()

def add_mesh_and_metanetx(driver):
    """
    Adds MeSH Term IDs, Synonym relations and Protein interactions to existing nodes
    using MeSH and MetaNetX. Also adds KEGG Pathway IDs.

    Args:
        driver (neo4j.Driver): Neo4J's Bolt Driver currently in use

    Returns:
        This function modifies the Neo4J Database as desired, but does not produce any particular return.
    """
    with driver.session() as session, alive_bar( 10, title="Querying MeSH and MetaNetX...") as bar:
        # We will also add MeSH terms to all nodes:
        misc.manage_transaction(MeSHandMetaNetXDataBases.add_mesh_by_name(), driver); bar()
        # We also add synonyms:
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("Name"), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("KEGG_ID"), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("ChEBI_ID"), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("HMDB_ID"), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("InChI"), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.write_synonyms_in_metanetx("InChIKey"), driver); bar()
        # And some protein interactions, together with their pathways, too:
        misc.manage_transaction(MeSHandMetaNetXDataBases.find_protein_data_in_metanetx(), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.find_protein_interactions_in_metanetx(), driver); bar()
        misc.manage_transaction(MeSHandMetaNetXDataBases.get_kegg_pathways_for_metabolites(), driver); bar()

def main():
    """
    The function that executes the code

    .. NOTE:: This function disables rdkit's log messages, since rdkit seems to dislike
        the way some of the InChI strings it is getting from the databases are formatted

    .. TODO:: RENAME THE MeSH NODES TO INDICATE THEIR TYPE. ADD A NAME TO THE WIKIDATA NODES
    .. TODO:: FIX THE REPEAT TRANSACTION FUNCTION
    .. TODO:: Match partial InChI based on DICE-MACCS
    .. TODO:: MAKE IT WORK -> THIS SECTION CURRENTLY SLOWS THINGS DOWN A LOT
    .. TODO:: CHECK APOC IS INSTALLED
    .. TODO:: FIX MAIN
    .. TODO:: MERGE BY INCHI, METANETX ID
    .. TODO:: Fix find_protein_interactions_in_metanetx
    .. TODO:: Move that function from setup to misc
    .. TODO:: EDIT conf.py
    .. TODO:: Document the following Schema Changes:

        * For Subject, we have a composite PK: Exposome_Explorer_ID, Age, Gender and Information
        * Now, more diseases will have a WikiData_ID and a related MeSH. This will help with
          networking. And, these diseases don't even need to be a part of a cancer!
        * The Gene nodes no longer exist in the full db? -> They do
    """
    # Parse the command line arguments
    # Done first in order to show errors if bad commands are issued
    parser = args_parser(); args = parser.parse_args()

    # If the session is set to be interactive, display the logging messages
    logging.basicConfig(format='%(message)s')
    if args.interactive:
        logging.getLogger().setLevel(logging.INFO)

    # We may also disable rdkit's log messages
    rdkit.RDLogger.DisableLog('rdApp.*')

    # And we read the query file
    raw_database = pd.read_csv(args.query, delimiter=',', header=0)

    # And connect to the Neo4J database
    driver = misc.connect_to_neo4j(args.adress, args.username, args.password)
    Neo4JImportPath = misc.get_import_path(driver)
    logging.info("Connected to Neo4J")

    if not os.path.exists(f"{os.path.abspath(args.dbfolder)}/index.json") and not args.noindex:
        print("No index file was found. Switching to non-indexed mode...")
        args.noindex = True

    # For each item in our "query" file, we will try to find matches:
    for index, row in raw_database.iterrows():
        # We start by cleaning the database (important if this is not the first run)
        with driver.session() as session:
            misc.manage_transaction( misc.clean_database(), driver)
            logging.info("Cleaned DataBase")

        print(f"Searching Synonyms for Metabolite {index+1}/{len(raw_database)} ...")

        # Then, we fill the empty values so that the program does not crash
        row.fillna("", inplace=True)
        # And prevent non-specified columns from crashing the program
        chebi_ids = row.ChEBI_ID if "ChEBI_ID" in raw_database.columns else ""
        names = row.Name if "Name" in raw_database.columns else ""
        hmdb_ids = row.HMDB_ID if "HMDB_ID" in raw_database.columns else ""
        inchis = row.InChI if "InChI" in raw_database.columns else ""
        mesh_ids = row.MeSH_ID if "MeSH_ID" in raw_database.columns else ""

        chebi_ids, names, hmdb_ids, inchis, mesh_ids = improve_search_terms(
            driver, chebi_ids, names, hmdb_ids, inchis, mesh_ids)

        print(f"Annotating Metabolite {index+1}/{len(raw_database)} using Built-In DataBases...")

        if args.noindex:
            # We prepare a scan of all the files available in our "DataBases" folder
            # We will cycle through them to try and find matches
            all_files = misc.scan_folder(args.dbfolder)
            all_files = [ x for x in all_files if "index.json" not in x ]
            import_based_on_all_files(all_files, Neo4JImportPath, driver, args.similarity,
                                      chebi_ids, names, hmdb_ids, inchis, mesh_ids)
        else:
            import_based_on_index(args.dbfolder, Neo4JImportPath, driver, args.similarity,
                                  chebi_ids, names, hmdb_ids, inchis, mesh_ids)

        # We first purge the database by deleting useless nodes that might overload the web queries
        misc.purge_database(driver, method = "delete")

        # Finally, we apply some functions that, although they could be run each time,
        # are so resource-intensive that it's better to just use them once:
        if args.webdbs:
            # First, check whether there are more than 10,000 nodes.
            result = misc.manage_transaction("MATCH (n) RETURN COUNT( DISTINCT n ) AS nodes", driver)
            number_of_nodes = result[0]["nodes"]
            # If there are fewer than that, run the web DBs; else, skip them
            # This is because, with lots of nodes, neo4j is unable to call the web DBs
            if number_of_nodes < 10000:
                print(f"Annotating Metabolite {index+1}/{len(raw_database)} using Web DataBases...")
                # We annotate the existing nodes using WikiData
                annotate_using_wikidata(driver)
                # Add their MeSH and MetaNetX IDs and synonyms
                add_mesh_and_metanetx(driver)
            else:
                print(f"{number_of_nodes} nodes found on the database, which is too many to call the Web DBs")
                print("without risk of failure. Skipping Web DBs for this metabolite...")

        # And purge any duplicates once again
        misc.purge_database(driver)

        # And save the graph in GraphML format
        with driver.session() as session:
            misc.manage_transaction(misc.export_graphml( f"metabolite_{index+1}.graphml"), driver)

        if not os.path.exists(args.results):
            os.makedirs(os.path.abspath(args.results))
        shutil.copyfile(f"{Neo4JImportPath}/metabolite_{index+1}.graphml",
                        f"{os.path.abspath(args.results)}/metabolite_{index+1}.graphml")
        os.remove(f"{Neo4JImportPath}/metabolite_{index+1}.graphml") # Remove the original to avoid errors

        logging.info(f"Metabolite {index+1}/{len(raw_database)} processed. You can find "
                     f"a copy of the associated knowledge graph at "
                     f"{os.path.abspath(args.results)}/metabolite_{index+1}.graphml")

    logging.info("Metabolite Processing has finished")

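# Typical invocation from the shell (illustrative; paths and credentials are placeholders,
# matching the defaults defined in args_parser):
#
#     python3 main.py --query my_metabolites.csv --dbfolder DataBases --results Results \
#                     --adress bolt://localhost:7687 --username neo4j --password neo4j
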
if __name__ == '__main__':
    main()