Skip to content

Commit

Permalink
Restore autostreamtree folder from commit 0c79596
Browse files Browse the repository at this point in the history
  • Loading branch information
tkchafin committed Feb 7, 2024
1 parent 8b5c0c7 commit a35fbb5
Show file tree
Hide file tree
Showing 46 changed files with 7,088 additions and 0 deletions.
Empty file added autostreamtree/__init__.py
Empty file.
54 changes: 54 additions & 0 deletions autostreamtree/aggregators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import numpy as np
import scipy


def aggregate_dist(method, stuff):
"""
Aggregates the given array according to the specified method.
Args:
method (str): The aggregation method to use (e.g. "HARM", "ARITH",
"GEOM", etc.).
stuff (array-like): The input array to be aggregated.
Returns:
float: The aggregated result.
"""
if method == "HARM":
stuff = [x for x in stuff if x > 0]
return scipy.stats.hmean(stuff) if len(stuff) > 0 else 0.0
elif method == "ARITH":
return np.mean(stuff)
elif method == "GEOM":
return scipy.stats.mstats.gmean(stuff)
elif method == "MEDIAN":
return np.median(stuff)
elif method == "MAX":
return np.max(stuff)
elif method == "MIN":
return np.min(stuff)
elif method == "ADJHARM":
return adjusted_harmonic_mean(stuff)
elif method == "SD":
return np.std(stuff)
elif method == "VAR":
return np.var(stuff)


def adjusted_harmonic_mean(stuff):
"""
Computes an adjusted harmonic mean that is corrected for non-positive
values.
Args:
stuff (array-like): The input array.
Returns:
float: The adjusted harmonic mean.
"""
s = np.array(stuff)
vals = s[s > 0.0]
bads = s[s <= 0.0]
mu = (1.0 / (np.sum([1.0/x for x in vals]) /
(len(vals)-len(bads)))) * ((len(vals)-len(bads))/len(vals))
return mu
318 changes: 318 additions & 0 deletions autostreamtree/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
import sys
import pandas as pd
import numpy as np
import geopandas as gpd
import pyogrio
import random
import matplotlib.pyplot as plt
import time

import autostreamtree.functions as ast
from autostreamtree.params import parseArgs
import autostreamtree.cluster_pops as clust
import autostreamtree.report_refs as ref
import autostreamtree.aggregators as agg


def main():

params = parseArgs()

# Set the seed for random number generation
if params.seed is not None:
np.random.seed(params.seed)
random.seed(params.seed)
else:
# Use clock time to seed RNG if params.seed is None
clock_seed = int(time.time() * 1000) % (2**32)
np.random.seed(clock_seed)
random.seed(clock_seed)

# Step 1: Reading in datasets
print("Running autostream tree", params.run, "workflow\n")
G = ast.read_network(params.network, params.shapefile)
# prune graph if list of edges passed
if params.edge_list:
df = pd.read_csv(params.edge_list, sep='\t')
edge_list = df[params.reachid_col].tolist()
G = ast.prune_graph(G, edge_list, params.reachid_col)

# read point coordinate data
points = pd.read_csv(params.geodb, sep="\t", header=0)
(point_coords, pop_coords, popmap) = ast.process_samples(params, points, G)

# read genetic data from VCF
if params.run != "STREAMDIST":
seqs = ast.read_vcf(params.vcf, concat=params.concat, popmap=popmap)

# Step 2: Calculating genetic distance matrix(-ces)
# Optional: Calculate separately for each locus

gen = None
pop_gen = None
# calculate genetic distance matrix
if params.run != "STREAMDIST" and params.run != "RUNLOCI":
if not params.genmat:
print("\nCalculating genetic distances...")
(gen, pop_gen) = ast.get_gendist_mats(params, point_coords, popmap,
seqs)
else:
print("\nReading genetic distances from provided matrix:",
params.genmat)
inmat = pd.read_csv(params.genmat, header=0, index_col=0, sep="\t")
(gen, pop_gen) = ast.parse_input_genmat(params, inmat,
point_coords, popmap, seqs)

ast.report_genmats(params, gen, pop_gen, point_coords, pop_coords)

# exit if this was the only step requested
if params.run == "GENDIST":
sys.exit(0)

# IMPORTANT NOTE: This function will silently skip any loci for which
# calculations aren't possible (e.g., population having no data)
if params.run == "RUNLOCI":
genlist = list()
popgenlist = list()
for loc in ast.get_loc_data(seqs):
try:
(gen, pop_gen) = ast.get_gendist_mats(params, point_coords,
popmap, loc)
genlist.append(gen)
popgenlist.append(pop_gen)
except Exception:
pass
print(
"\nCalculating average of locus-wise genetic distance matrices..."
)
if genlist[0] is not None:
gen = np.mean(genlist, axis=0)
else:
gen = genlist[0]
if popgenlist[0] is not None:
pop_gen = np.mean(popgenlist, axis=0)
else:
pop_gen = popgenlist[0]

ast.report_genmats_list(params, genlist, popgenlist, point_coords,
pop_coords)

# Step 4: Constructing a minimal subgraph

# Extract subgraph if needed
if params.run != "GENDIST":
K = ast.parse_subgraph_from_points(params, point_coords, pop_coords, G)

# Step 5: Construct stream distance matrix
# Optional: Mantel test

# calculate pairwise observed stream distances and indence matrix
# calculate incidence matrix X, which takes the form:
# nrows = rows in column vector form of D
# ncols = number of collapsed branches in stream network K
if params.run in ["STREAMDIST", "DISTANCES", "STREAMTREE", "IBD", "ALL",
"RUNLOCI"]:
if params.pop or params.geopop or params.clusterpop:
points = pop_coords
gen = pop_gen
else:
points = point_coords

# calculate stream distances and incidence matrix
(sdist, inc) = ast.get_stream_mats(points, K, params.length_col)
print("\nStream distances:")
print(sdist)
sDF = pd.DataFrame(sdist, columns=list(points.keys()),
index=list(points.keys()))
sDF.to_csv((str(params.out) + ".streamDistMat.txt"), sep="\t",
index=True)
del sDF
if params.run == "STREAMDIST":
sys.exit(0)

# if user requested testing isolation-by-distance:
if params.run in ['ALL', 'IBD']:
# IBD calculations and plots
print("\nTesting for isolation-by-distance using Mantel test with",
params.permutations, "permutations")
ast.test_ibd(gen, sdist, params.out, params.permutations)

# plot geographic x genetic DISTANCES
ast.plot_gen_by_geo(gen, sdist, params.out)

# if log request, re-do with logged geographic distances
if params.and_log:
print("\nTesting for isolation-by-distance with log",
"geographic distances using Mantel test with",
params.permutations, "permutations")
out = params.out+".log"
ast.test_ibd(gen, sdist, out, params.permutations, log=True)
ast.plot_gen_by_geo(gen, sdist, out, log=True)

#########################################################
# Step 6: Least-squares fitting
# Optional: Weighted least-squares
# Optional: Fit for each locus
#########################################################

if params.run in ["STREAMTREE", "ALL", "RUNLOCI"]:
if params.pop or params.geopop or params.clusterpop:
gen = pop_gen
print("\nIncidence matrix:")
print(inc)
ofh = params.out+".incidenceMatrix.txt"
with np.printoptions(precision=0, suppress=True):
np.savetxt(ofh, inc, delimiter="\t")
print("Incidence matrix dimensions:")
print(inc.shape)

# fit least-squares branch lengths
if params.run != "RUNLOCI":
print()
R = ast.fit_least_squares_distances(gen, inc.astype(int),
params.iterative,
params.out, params.weight)
print("\nFitted least-squares distances:")
print(R)
else:
print()
if params.pop or params.geopop or params.clusterpop:
genlist = popgenlist
print("Fitting StreamTree distances on per-locus matrices...")
Rlist = list()
ast.block_print()
for gen in genlist:
r = ast.fit_least_squares_distances(gen, inc.astype(int),
params.iterative,
params.out, params.weight)
Rlist.append(r)
ast.enable_print()

# aggregate locus distances (mean)
print("\nCalculating average fitted distances across loci using:",
params.loc_agg)
averageR = np.array(
[agg.aggregate_dist(params.loc_agg, col) for
col in zip(*Rlist)])
print(averageR)

# get standard-deviation locus distances as well
print("\nCalculating standard-deviation of fitted distances",
"across loci using:", params.loc_agg)
sdR = np.array([agg.aggregate_dist("SD", col) for
col in zip(*Rlist)])
print(sdR)

# check observed versus fitted distances:
if params.run == "RUNLOCI":
R = averageR
pred = ast.get_fitted_d(points, gen, inc, R)
print("\nComparing observed versus predicted genetic distances:")
print(pred)
ast.output_fitted_d(pred, params.out)

# Step 7: Outputs

# Now, annotate originate geoDF with dissolved reach IDs
# also, need to collect up the stream tree fitted D to each reach
# finally, could add residuals of fitting D vs LENGTH_KM?
# maybe include logDxlength, DxlogLength, logDxlogLength as well?

# get list of all REACHIDs to extract from geoDF
# edge_data = nx.get_edge_attributes(K,params.reachid_col)
reach_to_edge = dict()
i = 0
edges = list()
for e in K.edges():
edge_data = K[e[0]][e[1]][params.reachid_col]
for r in edge_data:
reach_to_edge[r] = str(i)
edges.append(i)
i += 1
del edge_data

# save reach_to_edge table to file
r2eDF = pd.DataFrame(list(reach_to_edge.items()),
columns=[params.reachid_col, 'EDGE_ID'])
r2eDF.to_csv((str(params.out)+".reachToEdgeTable.txt"),
sep="\t", index=False)

# read in original shapefile as geoDF and subset it
print("\nExtracting attributes from original dataframe...")
# geoDF = gpd.read_file(params.shapefile)
geoDF = pyogrio.read_dataframe(params.shapefile)
mask = geoDF[params.reachid_col].isin(list(reach_to_edge.keys()))
temp = geoDF.loc[mask]
del mask
del reach_to_edge

# join EDGE_ID to geoDF
geoDF = temp.merge(r2eDF, on=params.reachid_col, how='left')
for col in geoDF.columns:
if col.endswith('_x'):
geoDF.drop(col, axis=1, inplace=True)
elif col.endswith('_y'):
geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
print(geoDF)
del temp
del r2eDF

# plot by edge ID
base = geoDF.plot(column="EDGE_ID", cmap="prism")
coords = clust.coords_to_dataframe(points)
geo_coords = gpd.GeoDataFrame(coords,
geometry=gpd.points_from_xy(coords.long,
coords.lat))
geo_coords.plot(ax=base, marker='o', color='black', markersize=10,
zorder=10)

plt.title("Stream network colored by EDGE_ID")
plt.savefig((str(params.out)+".streamsByEdgeID.pdf"))

# add in fitted distances & plot
print("\nPlotting StreamTree fitted distances and writing new",
"shapefile...")

if params.run == "RUNLOCI":
fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
sdD = pd.DataFrame({'EDGE_ID': list(edges), 'stdevD': sdR})
i = 1
for locfit in Rlist:
name = "locD_" + str(i)
fittedD[name] = locfit
i += 1
else:
fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
geoDF['EDGE_ID'] = geoDF['EDGE_ID'].astype(int)
# geoDF.drop(columns=["EDGE_ID_x", "EDGE_ID_y"], inplace=True)
geoDF = geoDF.merge(fittedD, on='EDGE_ID', how="left")
for col in geoDF.columns:
if col.endswith('_x'):
geoDF.drop(col, axis=1, inplace=True)
elif col.endswith('_y'):
geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
if params.run == "RUNLOCI":
geoDF = geoDF.merge(sdD, on='EDGE_ID')
geoDF.plot(column="stdevD", cmap="RdYlGn_r", legend=True)
plt.title("Stream network colored by standard deviation of",
"StreamTree fitted distances")
plt.savefig((str(params.out)+".streamsBystdevD.pdf"))
geoDF.plot(column="fittedD", cmap="RdYlGn_r", legend=True)
plt.title("Stream network colored by StreamTree fitted distances")
plt.savefig((str(params.out)+".streamsByFittedD.pdf"))

# output a final annotated stream network layer
geoDF.to_csv((str(params.out)+".streamTree.txt"), sep="\t",
index=False)
ast.write_geodataframe(geoDF, (str(params.out)+".streamTree"),
params.output_driver)

refs = ref.fetch_references(params)
print(refs)

print("\nDone!\n")


# Call main function
if __name__ == '__main__':
main()
Loading

0 comments on commit a35fbb5

Please sign in to comment.