diff --git a/.gitignore b/.gitignore index 265d9fba..fa2e6743 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ **/display_tables data/cview.pb data/cview.pb.gz +data/cview.jsonl.gz data/regions.js data/us-states.geo.json data/europe.geo.json @@ -8,4 +9,4 @@ data/hardcoded_clusters.tsv data/NC_045512v2.fa data/ncbiGenes.gtf data/clean.pb -data/cluster_labels.tsv +data/cluster_labels.tsv \ No newline at end of file diff --git a/data/__pycache__/generate_display_tables.cpython-38.pyc b/data/__pycache__/generate_display_tables.cpython-38.pyc index ed662bd8..24df6725 100644 Binary files a/data/__pycache__/generate_display_tables.cpython-38.pyc and b/data/__pycache__/generate_display_tables.cpython-38.pyc differ diff --git a/data/__pycache__/master_backend.cpython-38.pyc b/data/__pycache__/master_backend.cpython-38.pyc index df2ce85f..c8a4ddca 100644 Binary files a/data/__pycache__/master_backend.cpython-38.pyc and b/data/__pycache__/master_backend.cpython-38.pyc differ diff --git a/data/__pycache__/update_js.cpython-38.pyc b/data/__pycache__/update_js.cpython-38.pyc index cd183496..ad6d81f8 100644 Binary files a/data/__pycache__/update_js.cpython-38.pyc and b/data/__pycache__/update_js.cpython-38.pyc differ diff --git a/data/generate_display_tables.py b/data/generate_display_tables.py index efb8c37d..d37d935e 100644 --- a/data/generate_display_tables.py +++ b/data/generate_display_tables.py @@ -1,4 +1,4 @@ -def generate_display_tables(conversion = {}, host = "https://raw.githubusercontent.com/jmcbroome/introduction-website/main/"): +def generate_display_tables(conversion = {}, host = "https://clustertracker.gi.ucsc.edu/", extension = ".jsonl.gz"): filelines = {} def fix_month(datestr): monthswap = {"Jan":"01","Feb":"02","Mar":"03","Apr":"04","May":"05","Jun":"06","Jul":"07","Aug":"08","Sep":"09","Oct":"10","Nov":"11","Dec":"12"} @@ -54,6 +54,21 @@ def fix_month(datestr): header = "Cluster ID\tRegion\tSample Count\tEarliest Date\tLatest 
Date\tClade\tLineage\tInferred Origins\tInferred Origin Confidences\tGrowth Score\tClick to View" mout = open("cluster_labels.tsv","w+") print("sample\tcluster",file=mout) + def generate_v1_link(cn): + link = "https://taxonium.org/?protoUrl=" + host + "data/cview" + extension + link += '&search=[{"id":0.123,"category":"cluster","value":"' + link += cn + link += '","enabled":true,"aa_final":"any","min_tips":1,"aa_gene":"S","search_for_ids":""}]' + link += '&colourBy={"variable":"region","gene":"S","colourLines":false,"residue":"681"}' + link += "&zoomToSearch=0&blinking=false" + return link + def generate_v2_link(cn): + link = "https://taxonium.org/?protoUrl=" + host + "data/cview" + extension + link += '&srch=[{"key":"aa1","type":"meta_cluster","method":"text_match","text":"' + link += cn + link += '","gene":"S","position":484,"new_residue":"any","min_tips":0,"controls":true}]' + link += "&zoomToSearch=0" + return link for reg, lines in filelines.items(): with open("display_tables/" + conversion[reg] + "_topclusters.tsv", "w+") as outf: print(header,file=outf) @@ -69,28 +84,23 @@ def fix_month(datestr): #generate a link to exist in the last column #based on the global "host" variable. #and including all html syntax. 
- link = "https://taxonium.org/?protoUrl=" + host + "data/cview.pb.gz" - link += '&search=[{"id":0.123,"category":"cluster","value":"' - link += spent[0] - link += '","enabled":true,"aa_final":"any","min_tips":1,"aa_gene":"S","search_for_ids":""}]' - link += '&colourBy={"variable":"region","gene":"S","colourLines":false,"residue":"681"}' - link += "&zoomToSearch=0&blinking=false" + if extension==".pb.gz": + link = generate_v1_link(spent[0]) + else: + link = generate_v2_link(spent[0]) #additionally process the date strings outline = [spent[0], spent[9], spent[1], fix_month(spent[2]), fix_month(spent[3]), spent[12], spent[13], spent[10], spent[11], spent[4], link] print("\t".join(outline),file=outf) - mout.close() sorted_defaults = sorted(list(zip(default_growthvs,default_lines)),key=lambda x:-x[0]) with open("display_tables/default_clusters.tsv","w+") as outf: print(header,file=outf) for gv,dl in sorted_defaults: spent = dl.split("\t") - link = "https://taxonium.org/?protoUrl=" + host + "data/cview.pb.gz" - link += '&search=[{"id":0.123,"category":"cluster","value":"' - link += spent[0] - link += '","enabled":true,"aa_final":"any","min_tips":1,"aa_gene":"S","search_for_ids":""}]' - link += '&colourBy={"variable":"region","gene":"S","colourLines":false,"residue":"681"}' - link += "&zoomToSearch=0&blinking=false" + if extension==".pb.gz": + link = generate_v1_link(spent[0]) + else: + link = generate_v2_link(spent[0]) outline = [spent[0], spent[9], spent[1], fix_month(spent[2]), fix_month(spent[3]), spent[12], spent[13], spent[10], spent[11], spent[4], link] print("\t".join(outline), file = outf) stateconv = {"AL":"Alabama","AK":"Alaska","AR":"Arkansas","AZ":"Arizona","CA":"California","CO":"Colorado", @@ -103,4 +113,4 @@ def fix_month(datestr): "WA":"Washington","WV":"West Virginia","WI":"Wisconsin","WY":"Wyoming","PR":"Puerto Rico"} stateconv.update({v:v for v in stateconv.values()}) if __name__ == "__main__": - generate_display_tables(stateconv, host =
"https://raw.githubusercontent.com/jmcbroome/introduction-website/main/") + generate_display_tables(stateconv, host = "https://clustertracker.gi.ucsc.edu/", extension=".jsonl.gz") diff --git a/data/master_backend.py b/data/master_backend.py index 068324d4..9d865fa1 100644 --- a/data/master_backend.py +++ b/data/master_backend.py @@ -27,7 +27,9 @@ def parse_setup(): parser.add_argument("-t","--threads",type=int,help="Number of threads to use.", default = 4) parser.add_argument("-l","--lexicon",help="Optionally, link to a text file containing all names for the same region, one region per row, tab separated.", default = "") parser.add_argument("-X","--lookahead",type=int,help="Number to pass to parameter -X of introduce. Increase to merge nested clusters. Default 2", default = 2) - parser.add_argument("-H","--host",help="Web-accessible link to the current directory for taxodium cluster view.",default="https://raw.githubusercontent.com/jmcbroome/introduction-website/main/") + parser.add_argument("-V","--taxversion",action='store_true',help="Export the view in Taxonium 2.0 jsonl format instead of taxonium protobuf. Requires the installation of taxoniumtools and adds some compute time.") + parser.add_argument("-H","--host",help="Web-accessible link to the current directory for taxodium cluster view.",default="https://clustertracker.gi.ucsc.edu/") + parser.add_argument("-S","--skip",action='store_true',help="Use to skip inference of introductions and go straight to preparing the data for display. 
hardcoded_clusters.tsv must already exist.") args = parser.parse_args() return args @@ -39,20 +41,17 @@ def primary_pipeline(args): else: conversion = {} # print(conversion) - print("Calling introduce.") - subprocess.check_call("matUtils introduce -i " + args.input + " -s " + args.sample_regions + " -u hardcoded_clusters.tsv -T " + str(args.threads) + " -X " + str(args.lookahead), shell=True) + if not args.skip: + print("Calling introduce.") + subprocess.check_call("matUtils introduce -i " + args.input + " -s " + args.sample_regions + " -u hardcoded_clusters.tsv -T " + str(args.threads) + " -X " + str(args.lookahead), shell=True) + else: + print("Skipping introduction inference.") print("Updating map display data.") update_js(args.geojson, conversion) - print("Generating top cluster tables.") - generate_display_tables(conversion, host = args.host) + print("Generating top cluster tables.") + generate_display_tables(conversion, host = args.host, extension = ".jsonl.gz" if args.taxversion else ".pb.gz") print("Preparing taxodium view.") sd = {} - # with open("cluster_labels.tsv") as inf: - # for entry in inf: - # spent = entry.strip().split() - # if spent[0] == "sample": - # continue - # sd[spent[0]] = spent[1] with open("hardcoded_clusters.tsv") as inf: for entry in inf: spent = entry.strip().split('\t') @@ -88,8 +87,12 @@ def primary_pipeline(args): spent.append("None") i += 1 print("\t".join(spent),file=outf) - print("Generating viewable pb.") - subprocess.check_call("matUtils extract -i " + args.input + " -M clusterswapped.tsv -F cluster,region --write-taxodium cview.pb --title Cluster-Tracker -g " + args.annotation + " -f " + args.reference,shell=True) + if not args.taxversion: + print("Generating viewable pb.") + subprocess.check_call("matUtils extract -i " + args.input + " -M clusterswapped.tsv -F cluster,region --write-taxodium cview.pb --title Cluster-Tracker -g " + args.annotation + " -f " + args.reference,shell=True) + else: + print("Generating viewable 
jsonl.") + subprocess.check_call("usher_to_taxonium -i " + args.input + " -m clusterswapped.tsv -c cluster,region -o cview.jsonl.gz --title Cluster-Tracker",shell=True) print("Process completed; check website for results.") if __name__ == "__main__":