Source code for dae.gene.weights

import itertools
from collections import OrderedDict, namedtuple

import numpy as np
import pandas as pd

from dae.gene.gene_sets_db import cached
from dae.utils.dae_utils import join_line


# Immutable configuration record for a single gene weight. The field names
# are the attributes that GeneWeight reads from its ``config`` argument
# (``file``, ``desc``, ``bins``, ``xscale``, ``yscale``, ``range``) plus the
# weight ``id``.
GeneWeightConfig = namedtuple(
    "GeneWeightConfig",
    "id file desc bins xscale yscale range",
)


class GeneWeight:
    """
    Represents gene weights.

    Loads a CSV file with gene weights by gene weight id as described
    in `geneInfo.conf`.

    After construction the instance exposes:

    * ``df`` -- data frame with the ``gene`` column and the weight column
      named after ``id``; rows containing missing values are dropped.
    * ``histogram_bins`` / ``histogram_bars`` -- bin edges and counts
      computed by ``numpy.histogram`` over the weight values.
    * ``desc``, ``bins``, ``xscale``, ``yscale``, ``filename``, ``range`` --
      copied from the supplied configuration object.
    """

    def __init__(self, section_id, config):
        """
        `section_id` -- the weight id; must be a column of the CSV file.

        `config` -- object providing `file`, `desc`, `bins`, `xscale`,
        `yscale` and, optionally, `range`.
        """
        self.config = config
        self.id = section_id
        self.df = None
        self._dict = None
        self.genomic_values_col = "gene"

        self.desc = self.config.desc
        self.bins = self.config.bins
        self.xscale = self.config.xscale
        self.yscale = self.config.yscale
        self.filename = self.config.file
        self.range = getattr(self.config, "range", None)

        self._load_data()
        self.df.dropna(inplace=True)

        self.histogram_bins, self.histogram_bars = self._bins_bars()

    def _load_data(self):
        """
        Reads the CSV file and keeps only the gene and weight columns.

        Stores the result in `self.df` and returns it.
        """
        assert self.filename is not None

        df = pd.read_csv(self.filename)
        assert self.id in df.columns, "{} not found in {}".format(
            self.id, df.columns
        )

        # Copy so later in-place operations do not touch the parsed frame.
        self.df = df[[self.genomic_values_col, self.id]].copy()
        return self.df

    def values(self):
        """
        Returns the weight values as an array.
        """
        return self.df[self.id].values

    def _bins_bars(self):
        """
        Computes the histogram of the weight values.

        Returns a `(bins, bars)` tuple: the bin edges and the per-bin counts
        produced by `numpy.histogram`.
        """
        values = self.values()
        lowest = self.min()
        highest = self.max()

        span = 1.0 * (highest - lowest)
        # With a single bin there is no interior edge to space out, so use
        # the whole span instead of dividing by zero.
        step = span / (self.bins - 1) if self.bins > 1 else span

        # Number of decimals needed to represent one step. A zero step
        # (all weights equal) or an undefined one (no data) rounds to
        # whole numbers instead of taking the logarithm of zero.
        dec = int(max(-np.log10(step), 0)) if step > 0 else 0

        # NOTE(review): np.around rounds to nearest, so the left edge can
        # land above the smallest value -- confirm this is intended.
        bleft = np.around(lowest, dec)
        bright = np.around(highest + step, dec)

        if self.xscale == "log":
            # Max numbers of items in first bin
            max_count = values.size / self.bins

            # Find a bin small enough to fit max_count items
            for exponent in range(-1, -200, -1):
                if (values < 10 ** exponent).sum() < max_count:
                    break

            bins_in = [0] + list(
                np.logspace(exponent, np.log10(bright), self.bins)
            )
        else:
            bins_in = self.bins

        # `range` only affects an integer bin count; explicit edges are
        # used as given.
        bars, bins = np.histogram(values, bins_in, range=[bleft, bright])
        return (bins, bars)

    def get_gene_value(self, gene_symbol):
        """
        Returns the weight of `gene_symbol`.

        Raises `KeyError` when the gene has no weight defined.
        """
        symbol_values = self._to_dict()
        return symbol_values[gene_symbol]

    @cached
    def _to_dict(self):
        """
        Returns dictionary of all defined weights keyed by gene symbol.
        """
        if self._dict is None:
            indexed = self.df.set_index(self.genomic_values_col)
            self._dict = indexed[self.id].to_dict()
        return self._dict

    @cached
    def _to_list(self):
        """
        Returns an iterator over rows of strings: the header row first,
        followed by one row per gene.
        """
        # NOTE(review): `cached` is defined outside this file. If it
        # memoizes the returned object, this one-shot iterator would be
        # empty on every call after the first consumption -- confirm.
        as_text = self.df.applymap(str)
        return itertools.chain(
            [as_text.columns.tolist()], as_text.values.tolist()
        )

    @cached
    def to_tsv(self):
        """
        Returns an iterator that applies `join_line` to every row of
        `_to_list()`.
        """
        return map(join_line, self._to_list())

    @cached
    def min(self):
        """
        Returns minimal weight value.
        """
        return self.df[self.id].min()

    @cached
    def max(self):
        """
        Returns maximal weight value.
        """
        return self.df[self.id].max()

    def get_genes(self, wmin=None, wmax=None):
        """
        Returns a set of genes which weights are between `wmin` and `wmax`.

        `wmin` -- the lower bound of weights (inclusive). If not specified
        or `None` works without lower bound.

        `wmax` -- the upper bound of weights (exclusive). If not specified
        or `None` works without upper bound.

        A bound lying outside the observed weight interval is treated as
        if it was not specified.
        NOTE(review): this means `wmin` above the maximum selects all
        genes rather than none -- confirm with callers.
        """
        weights = self.df[self.id]
        # min()/max() skip missing values, so no in-place cleanup of the
        # column is needed here.
        lowest = weights.min()
        highest = weights.max()

        if wmin is None or wmin < lowest or wmin > highest:
            wmin = float("-inf")
        if wmax is None or wmax < lowest or wmax > highest:
            wmax = float("inf")

        # Missing values compare False and are therefore never selected.
        in_range = (weights >= wmin) & (weights < wmax)
        genes = self.df.loc[in_range, self.genomic_values_col]
        return set(genes.values)
class GeneWeightsDb(object):
    """
    Helper class used to load all defined gene weights.

    Used by Web interface.

    Behaves like a read-only mapping from weight id to `GeneWeight`:
    supports `db[weight_id]`, `weight_id in db` and `len(db)`.
    """

    def __init__(self, config):
        """
        `config` -- configuration object; `_load` reads its `gene_weights`
        mapping and `gene_info.selected_gene_weights`.
        """
        super(GeneWeightsDb, self).__init__()
        self.config = config
        self.weights = OrderedDict()
        self._load()

    @staticmethod
    def load_gene_weight_from_file(
        filename,
        bins=150,
        xscale="linear",
        yscale="linear",
        desc=None,
        range=None,
    ):
        """
        Builds a standalone `GeneWeight` from a CSV file.

        The weight id is the part of `filename` before the first dot and
        must be the name of a column in the file.
        NOTE(review): a `filename` with a directory part or a leading dot
        gives an id that is unlikely to match a column -- confirm the
        expected form with callers.
        """
        config = GeneWeightConfig(
            id=filename.split(".")[0],
            file=filename,
            desc=desc,
            bins=bins,
            xscale=xscale,
            yscale=yscale,
            range=range,
        )
        # GeneWeight takes the weight id as its first argument.
        return GeneWeight(config.id, config)

    @cached
    def get_gene_weight_ids(self):
        """
        Returns the list of loaded gene weight ids.
        """
        return list(self.weights.keys())

    @cached
    def get_gene_weights(self):
        """
        Returns the list of all loaded `GeneWeight` objects.
        """
        return [self.get_gene_weight(weight_id) for weight_id in self.weights]

    def get_gene_weight(self, weight_id):
        """
        Returns the `GeneWeight` registered under `weight_id`.

        Raises `ValueError` when the id is not known.
        """
        weight = self[weight_id]
        assert weight.df is not None
        return weight

    def _load(self):
        """
        Loads every configured gene weight that is listed in
        `gene_info.selected_gene_weights`.
        """
        if not (self.config and self.config.gene_weights):
            return

        selected = self.config.gene_info.selected_gene_weights
        for section_id, weight_config in self.config.gene_weights.items():
            if section_id in selected:
                self.weights[section_id] = GeneWeight(
                    section_id, weight_config
                )

    def __getitem__(self, weight_id):
        if weight_id not in self.weights:
            raise ValueError("unsupported gene weight {}".format(weight_id))

        res = self.weights[weight_id]
        if res.df is None:
            # Reload the same way the GeneWeight constructor does: read
            # the file, then drop rows with missing values.
            res._load_data()
            res.df.dropna(inplace=True)
        return res

    def __contains__(self, weight_id):
        return weight_id in self.weights

    def __len__(self):
        return len(self.weights)