Source code for dae.gene.utils

# pylint: disable=invalid-name, unused-variable

import sys
from copy import deepcopy

from box import Box

from dae.configuration.gpf_config_parser import FrozenBox


def rename_gene_terms(config, gene_terms, inNS):
    """Return a copy of `gene_terms` with its genes renamed into `inNS`.

    The input object is never modified; all renaming is applied to a deep
    copy, which is what gets returned.

    Args:
        config: configuration object. For the "id" -> "sym" conversion its
            ``gene_info.genes`` mapping is used (loaded through
            ``getGenes`` when missing or empty); for "sym" -> "id" it is
            handed to ``getCleanGeneId``.
        gene_terms: object exposing a ``geneNS`` attribute and a
            ``renameGenes(namespace, rename_function)`` method.
        inNS: target namespace, either "id" or "sym".

    Returns:
        A deep copy of `gene_terms`, renamed when its namespace differs
        from `inNS`.

    Raises:
        AssertionError: if either namespace is not "id" or "sym".
    """
    if not {gene_terms.geneNS, inNS} <= {"id", "sym"}:
        # Raised explicitly instead of through an `assert` statement so the
        # validation is not stripped under `python -O`. The exception type
        # is kept for callers that catch AssertionError.
        raise AssertionError(
            f'The provided namespaces "{gene_terms.geneNS}",'
            f' "{inNS}" must be either "id" or "sym"!'
        )

    result = deepcopy(gene_terms)
    if result.geneNS == inNS:
        # Already in the requested namespace: nothing to rename.
        return result

    if result.geneNS == "id" and inNS == "sym":
        if not (config.gene_info and config.gene_info.genes):
            config = getGenes(config)
        genes = config.gene_info.genes

        def rF(x):
            # Gene ids that are not present in the gene info map to None.
            if x in genes:
                return genes[x].sym
            return None

        result.renameGenes("sym", rF)
    elif result.geneNS == "sym" and inNS == "id":
        result.renameGenes(
            "id",
            lambda x: getCleanGeneId(config, "sym", x),
        )

    return result
def getGeneTermAtt(config, gt_id, attName):
    """Return attribute `attName` of the `gt_id` entry of ``config.gene_terms``.

    Both lookups use ``getattr``, so a missing entry or a missing attribute
    raises AttributeError.
    """
    return getattr(getattr(config.gene_terms, gt_id), attName)
# def getGeneTerms(config, gt_id="main", inNS="sym"):
#     fl = getattr(config.gene_terms, gt_id).file
#     gt = loadGeneTerm(fl)
#     try:
#         gt = rename_gene_terms(config, gt, inNS)
#     except AssertionError:
#         raise Exception(
#             (
#                 f"Unknown namespace(s) for the {gt_id} gene terms:"
#                 " |{gt.geneNS}| -> |{inNS}|"
#             )
#         )
#     return gt


def _parseNCBIGeneInfo(config):  # pylint: disable=too-many-locals
    """Parse the tab-separated file named by ``config.gene_info_file``.

    Lines starting with "#" are skipped. Every other line must split into
    exactly 15 tab-separated columns, in this order:

        tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
        map_location description type_of_gene
        Symbol_from_nomenclature_authority
        Full_name_from_nomenclature_authority Nomenclature_status
        Other_designations Modification_date

    Only GeneID, Symbol, Synonyms ("|"-separated) and description are kept.

    Returns:
        A ``(genes, nsTokens)`` pair: ``genes`` maps gene id to a Box with
        ``id``, ``sym``, ``syns`` and ``desc``; ``nsTokens`` maps each of
        the namespaces "id", "sym" and "syns" to a dict from token to the
        list of gene records carrying that token.

    Raises:
        ValueError: on a line without exactly 15 columns, or when a gene id
            occurs more than once.
    """
    genes_by_id = {}
    token_index = {}

    with open(config.gene_info_file) as infile:
        for raw_line in infile:
            if raw_line[0] == "#":
                continue

            columns = raw_line.strip().split("\t")
            if len(columns) != 15:
                raise ValueError(
                    f"Unexpected line in the {config.gene_info_file}")

            # Positions of the columns that are actually used; the
            # remaining columns are ignored.
            gene_id = columns[1]
            symbol = columns[2]
            synonyms = columns[4]
            description = columns[8]

            record = Box(
                id=gene_id,
                sym=symbol,
                syns=set(synonyms.split("|")),
                desc=description,
            )

            if record.id in genes_by_id:
                raise ValueError(
                    f"The gene {record.id} is repeated twice in the "
                    f"{config.gene_info_file} file")
            genes_by_id[record.id] = record

            _addNSTokenToGeneInfo(token_index, "id", record.id, record)
            _addNSTokenToGeneInfo(token_index, "sym", record.sym, record)
            for synonym in record.syns:
                _addNSTokenToGeneInfo(token_index, "syns", synonym, record)

    print("loaded ", len(genes_by_id), "genes", file=sys.stderr)
    return genes_by_id, token_index


def _addNSTokenToGeneInfo(nsTokens, ns, token, gi):
    """Append `gi` to the list stored at ``nsTokens[ns][token]``.

    The namespace dict and the per-token list are created on first use.
    """
    nsTokens.setdefault(ns, {}).setdefault(token, []).append(gi)


# pylint: disable=inconsistent-return-statements
def getCleanGeneId(config, ns, t):
    """Return the id of the single gene that token `t` names in namespace `ns`.

    Args:
        config: configuration object; its ``gene_info.ns_tokens`` index is
            loaded through ``getNsTokens`` when it has not been loaded yet.
        ns: namespace to look the token up in.
        t: the token to resolve.

    Returns:
        The gene id when `t` maps to exactly one gene in `ns`; None when
        the namespace is unknown, the token is unknown, or the token maps
        to a number of genes other than one.
    """
    # getNsTokens returns the (possibly reloaded) config rather than the
    # token index itself, so the index has to be read from that config.
    config = getNsTokens(config)
    ns_tokens = config.gene_info.ns_tokens

    if ns not in ns_tokens:
        return None

    allTokens = ns_tokens[ns]
    if t not in allTokens:
        return None

    matches = allTokens[t]
    if len(matches) != 1:
        # An ambiguous token cannot be resolved to a single id.
        return None
    return matches[0].id
def loadNCBIGeneInfo(config):
    """Return a new frozen config with the parsed NCBI gene info attached.

    ``config.gene_info`` is parsed with ``_parseNCBIGeneInfo``; the results
    are stored under ``gene_info.genes`` and ``gene_info.ns_tokens`` in a
    dict copy of `config`, which is then wrapped in a ``FrozenBox``.
    """
    parsed_genes, parsed_tokens = _parseNCBIGeneInfo(config.gene_info)

    raw_config = config.to_dict()
    gene_info = raw_config.setdefault("gene_info", {})
    gene_info["genes"] = parsed_genes
    gene_info["ns_tokens"] = parsed_tokens

    return FrozenBox(raw_config)
def getGenes(config):
    """Return a config whose ``gene_info.genes`` has been loaded.

    When ``config.gene_info.genes`` is None the gene info is loaded with
    ``loadNCBIGeneInfo`` and the resulting config is returned; otherwise
    `config` itself is returned. The return value is the config, not the
    genes mapping.
    """
    already_loaded = config.gene_info.genes is not None
    return config if already_loaded else loadNCBIGeneInfo(config)
def getNsTokens(config):
    """Return a config whose ``gene_info.ns_tokens`` has been loaded.

    When ``config.gene_info.ns_tokens`` is None the gene info is loaded
    with ``loadNCBIGeneInfo`` and the resulting config is returned;
    otherwise `config` itself is returned. The return value is the config,
    not the token index; read ``gene_info.ns_tokens`` from it.
    """
    already_loaded = config.gene_info.ns_tokens is not None
    return config if already_loaded else loadNCBIGeneInfo(config)