#!/usr/bin/env python
import argparse
import contextlib
import glob
import logging
import os
import sys
from pathlib import Path

import yaml

from dae.gene.gene_sets_db import GeneSet, build_gene_set_collection_from_file
from dae.gene.gene_term import (
    read_ewa_set_file,
    read_gmt_file,
    read_mapping_file,
)
# Logger for this command-line tool, registered under a fixed name
# instead of the module's ``__name__``.
logger = logging.getLogger("create_sqlite_gene_set")
# pylint: disable=consider-using-with, too-many-locals, too-many-branches
# pylint: disable=too-many-statements
def main(argv):
    """Build a sqlite gene set collection from a genomic resource.

    Parses the command-line arguments, reads the gene terms described by the
    ``genomic_resource.yaml`` file given with ``--resource`` and adds one
    gene set per term to the collection created at ``--output``.

    Args:
        argv: command-line arguments without the program name
            (e.g. ``sys.argv[1:]``).

    Raises:
        ValueError: if neither ``--resource`` nor ``--format`` is given, or
            if the resource declares a format other than ``map``, ``gmt``
            or ``directory``.
        NotImplementedError: if ``--format`` is given without ``--resource``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", "-V", action="count", default=0)
    parser.add_argument(
        "--resource",
        dest="resource",
        help="path to genomic_resource.yaml to use",
        default=None,
    )
    parser.add_argument(
        "--format",
        dest="format",
        help="format of gene set (map, gmt, directory)",
        default=None,
    )
    # NOTE: --filename and --directory are accepted but not used yet; they
    # belong to the unimplemented ``--format`` code path.
    parser.add_argument(
        "--filename",
        dest="filename",
        help="path to gmt or map file",
        default=None,
    )
    parser.add_argument(
        "--directory",
        dest="directory",
        help="path to directory collection data",
        default=None,
    )
    parser.add_argument(
        "--output",
        dest="output",
        help="where to write sqlite DB file",
        default="collection_db",
    )
    args = parser.parse_args(argv)

    # -V and no flag at all both log at WARNING; -VV is INFO, -VVV+ is DEBUG.
    if args.verbose >= 3:
        log_level = logging.DEBUG
    elif args.verbose == 2:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    # Reject unsupported argument combinations before touching any file.
    if args.resource is None:
        if args.format is not None:
            raise NotImplementedError
        raise ValueError("Invalid arguments")

    with open(args.resource) as file:
        resource_config = yaml.safe_load(file.read())

    # Paths inside the resource config are relative to the config's folder.
    base_dir = os.path.dirname(args.resource)
    collection_id = resource_config["id"]
    collection_format = resource_config["format"]

    # Every data file is registered on the stack so it is closed on exit,
    # including when a reader or the DB build raises. The stack stays open
    # until the DB is populated in case the readers consume files lazily.
    with contextlib.ExitStack() as stack:
        if collection_format == "map":
            filename = resource_config["filename"]
            filepath = os.path.join(base_dir, filename)
            # The optional names file sits next to the map file: the last
            # four characters of the name (presumably ".txt") are replaced
            # with "names.txt".
            names_filename = filename[:-4] + "names.txt"
            names_filepath = os.path.join(base_dir, names_filename)
            names_file = None
            if os.path.exists(names_filepath):
                names_file = stack.enter_context(open(names_filepath))
            gene_terms = read_mapping_file(
                stack.enter_context(open(filepath)),
                names_file,
            )
        elif collection_format == "gmt":
            filename = resource_config["filename"]
            filepath = os.path.join(base_dir, filename)
            gene_terms = read_gmt_file(stack.enter_context(open(filepath)))
        elif collection_format == "directory":
            directory = os.path.join(
                base_dir, resource_config["directory"],
            )
            filepaths = glob.glob(f"{directory}/*.txt")
            # Each entry pairs the file stem with its open handle.
            files = [
                (Path(f).stem, stack.enter_context(open(f)))
                for f in filepaths
            ]
            gene_terms = read_ewa_set_file(files)
        else:
            raise ValueError("Invalid collection format type")

        db = build_gene_set_collection_from_file(
            args.output,
            collection_id=collection_id,
            collection_format="sqlite",
        )
        # One gene set per term: name, description and its gene symbols.
        for key, value in gene_terms.tDesc.items():
            syms = list(gene_terms.t2G[key].keys())
            gene_set = GeneSet(key, value, syms)
            db.add_gene_set(gene_set)
if __name__ == "__main__":
    # Executed as a script: pass the command-line arguments, minus the
    # program name, to the tool's entry point.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)