diff --git a/code/cluster.q b/code/cluster.q index 3f981fb..36f0bff 100644 --- a/code/cluster.q +++ b/code/cluster.q @@ -1,72 +1,308 @@ +// code/cluster.q - Nlp clustering utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Clustering utilites for textual data + \d .nlp -// Run on either docs or keyword dicts -cluster.i.asKeywords:{i.fillEmptyDocs $[-9=type x[0]`keywords;x;x`keywords]} - -// Get cohesiveness of cluster as measured by mean sum of squares error -cluster.MSE:{[docs] - $[0=n:count docs;0n;1=n;1.;0=sum count each docs;0n; - avg d*d:0^compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]} - -// Bisecting k-means algo (repeatedly splits largest cluster in 2) -cluster.bisectingKMeans:{[docs;k;n] - if[0=n:count docs:cluster.i.asKeywords docs;:()]; - (k-1){[n;docs;clusters] - cluster:clusters idx:i.minIndex cluster.MSE each docs clusters; - (clusters _ idx),cluster@/:cluster.kmeans[docs cluster;2;n] - }[n;docs]/enlist til n} - -// k-means clustering for docs -cluster.kmeans:{[docs;k;n] - n{[docs;clusters] - centroids:(i.takeTop[3]i.fastSum@)each docs clusters; - value group i.maxIndex each centroids compareDocs\:/:docs - }[docs]/(k;0N)#neg[nd]?nd:count docs:cluster.i.asKeywords docs} - -// Match each doc to nearest centroid -cluster.groupByCentroids:{[centroids;docs] - value group{[centroids;doc]$[00f;similarities?m;0n] + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Merge any clusters with significant overlap into a single +// cluster +// @param clusters {any[][]} Cluster indices +// @returns {any[][]} Appropriate clusters merged together cluster.i.mergeOverlappingClusters:{[clusters] - similarClusters:{[clusters;counts;idx] - superset:counts=sum each clusters[idx]in/:clusters; - similar:.5<=avg each clusters[idx]in/:clusters; - notSmaller:(count clusters idx)>=count each clusters; - where superset or(similar & notSmaller) - }[clusters;count each clusters]each til count clusters; - merge:1=count each clusters; + where superset or(similar & notSmaller) + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Normalize the columns of a matrix so they sum to 1 +// @param matrix {float[][]} Numeric matrix of values +// @returns {float[][]} The normalized columns +cluster.i.columnNormalize:{[matrix] + 0f^matrix%\:sum matrix + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Graph clustering that works on a similarity matrix +// @param matrix {boolean[][]} NxN adjacency matrix +// @returns {long[][]} Lists of indices in the corpus where each row +// is a cluster +cluster.i.similarityMatrix:{[matrix] + matrix:"f"$matrix; + // Make the matrix stochastic and run MCL until stable + normMatrix:cluster.i.columnNormalize matrix; + attractors:cluster.i.MCL/[normMatrix]; + // Use output of MCL to get the clusters + clusters:where each attractors>0; + // Remove empty clusters and duplicates + distinct clusters where 0<>count each clusters + } -// Extremely fast clustering algo for large datasets (produces small but cohesive clusters) -cluster.radix:{[docs;n] - reduced:{distinct 4#key desc x}each docs:cluster.i.asKeywords docs; - keywords:(where 5<=count each group raze reduced)except`; +// @private +// @kind function +// @category nlpClusteringUtility +// @desc SM Van Dongen's MCL clustering algorithm +// @param matrix {float[][]} NxN matrix +// @return {float[][]} MCL algorithm applied to matrix +cluster.i.MCL:{[matrix] + // Expand matrix by raising to the nth power (currently 
set to 2) + do[2-1;mat:{i.np[`:matmul;x;x]`}matrix]; + mat:cluster.i.columnNormalize mat*mat; + @[;;:;0f] ./:flip(mat;where each(mat>0)&(mat<.00001)) + } + +// @kind function +// @category nlpClustering +// @desc Uses the top ten keywords of each document in order to cluster +// similar documents together +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @returns {long[][]} The documents' indices grouped into clusters +cluster.summarize:{[parsedTab;k] + if[0=count parsedTab;:()]; + docs:i.takeTop[10]each cluster.i.asKeywords parsedTab; + summary:i.fastSum[docs]%count docs; + centroids:(); + do[k; + // Find the document that summarizes the corpus best + // and move that document to the centroid list + centroids,:nearest:i.maxIndex docs[;i.maxIndex summary]; + summary-:docs nearest; + summary:(where summary<0)_ summary + ]; + cluster.groupByCentroids[docs centroids;docs] + } + +// @kind function +// @category nlpClustering +// @desc Use the top 50 keywords of each document to calculate the +// cohesiveness as measured by the mean sum of sqaures +// @param keywords {dictionary[]} A parsed document containing keywords and +// their associated significance scores +// @returns {float} The cohesion of the cluster +cluster.MSE:{[parsedTab] + n:count parsedTab; + if[(0=n)|0=sum count each parsedTab,(::);:0n]; + if[1=n;:1f]; + centroid:i.takeTop[50]i.fastSum parsedTab; + docs:i.fillEmptyDocs parsedTab; + // Don't include the current document in the centroid, or for small clusters + // it just reflects its similarity to itself + dists:0^compareDocToCentroid[centroid]each docs; + avg dists*dists + } + +// @kind function +// @category nlpClustering +// @desc The bisecting k-means algorithm which uses k-means to +// repeatedly split the most cohesive clusters into two clusters +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @param iters {long} The number of times to iterate the refining step +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.bisectingKMeans:{[parsedTab;k;iters] + docs:cluster.i.asKeywords parsedTab; + if[0=n:count docs;:()]; + (k-1)cluster.i.bisect[iters;docs]/enlist til n + } + +// @kind function +// @category nlpClustering +// @desc k-means clustering for documents +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @param iters {long} The number of times to iterate the refining step +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.kmeans:{[parsedTab;k;iters] + docs:cluster.i.asKeywords parsedTab; + numDocs:count docs; + iters cluster.i.kmeans[docs]/(k;0N)#neg[numDocs]?numDocs + } + +// @kind function +// @category nlpClustering +// @desc Given a list of centroids and a list of documents, match each +// document to its nearest centroid +// @param centroids {dictionary[]} Centroids as keyword dictionaries +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {long[][]} Lists of document indices where each list is a cluster +// N.B. These don't line up with the number of centroids passed in, +// and the number of lists returned may not equal the number of centroids. 
+// There can be documents which match no centroids (all of which will end up +// in the same group), and centroids with no matching documents. +cluster.groupByCentroids:{[centroids;parsedTab] + // If there are no centroids, everything is in one group + if[not count centroids;:enlist til count parsedTab]; + value group cluster.i.findNearestNeighbor[centroids]each parsedTab + } + +// @kind function +// @category nlpClustering +// @desc Uses the Radix clustering algorithm and bins are taken from +// the top 3 terms of each document +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters desired, though fewer may +// be returned. This must be fairly high to cover a substantial amount of the +// corpus, as clusters are small +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.radix:{[parsedTab;k] + docs:cluster.i.asKeywords parsedTab; + // Bin on keywords, taking the 3 most significant keywords from each document + // and dropping those that occur less than 3 times + reduced:{distinct 4#key desc x}each docs; + // Remove any keywords that occur less than 5 times + keywords:where (count each group raze reduced) >= 5; + keywords:keywords except `; clusters:{[reduced;keyword]where keyword in/:reduced}[reduced]each keywords; + // Score clusters based on the harmonic mean of their cohesion and log(size) cohesion:i.normalize cluster.MSE each docs clusters; size:i.normalize log count each clusters; score:i.harmonicMean each flip(cohesion;size); - sublist[n]cluster.i.mergeOverlappingClusters/[clusters sublist[2*n]idesc score]} + // Take the n*2 highest scoring clusters, as merging will remove some + // but don't run it on everything, since merging is expensive. + // This may lead to fewer clusters than expected if a lot of merging happens + clusters:clusters sublist[2*k]idesc score; + sublist[k]cluster.i.mergeOverlappingClusters/[clusters] + } -cluster.fastRadix:{[docs;n] - docs:cluster.i.asKeywords docs; - grouped:(group i.maxIndex each docs)_`; +// @kind function +// @category nlpClustering +// @desc Uses the Radix clustering algorithm and bins by the most +// significant term +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters desired, though fewer may +// be returned. 
This must be fairly high to cover a substantial amount of the +// corpus, as clusters are small +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.fastRadix:{[parsedTab;k] + docs:cluster.i.asKeywords parsedTab; + // Group documents by their most significant term + grouped:group i.maxIndex each docs; + // Remove the entry for empty documents + grouped:grouped _ `; + // Remove all clusters containing only one element clusters:grouped where 1=mn; + clusters:cluster.i.similarityMatrix similarities>=minimum; + clustersOfOne:1=count each clusters; if[not sample;:clusters where not clustersOfOne]; // Any cluster of 1 documents isn't a cluster, so throw it out outliers:raze clusters where clustersOfOne; @@ -76,33 +312,5 @@ cluster.MCL:{[docs;mn;sample] centroids:avg each keywords clusters; // Move each non-outlier to the nearest centroid nonOutliers:(til count docs)except idx outliers; - nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers]} - -// Graph clustering that works on a similarity matrix -cluster.i.columnNormalize:{[mat]0f^mat%\:sum mat} -cluster.i.similarityMatrix:{[mat] - matrix:"f"$mat; - // SM Van Dongen's MCL clustering algorithm - MCL:{[mat] - // Expand matrix by raising to the nth power (currently set to 2) - do[2-1;mat:{i.np[`:matmul;x;x]`}mat]; - mat:cluster.i.columnNormalize mat*mat; - @[;;:;0f] ./:flip(mat;where each(mat>0)&(mat<.00001))}; - // Make the matrix stochastic and run MCL until stable - attractors:MCL/[cluster.i.columnNormalize mat]; - // Use output of MCL to get the clusters - clusters:where each attractors>0; - // Remove empty clusters and duplicates - distinct clusters where 0<>count each clusters} - -// Subtracts most representive elements from centroid & iterate until number of clusters reached -cluster.summarize:{[docs;n] - if[0=count docs;:()]; - docs:i.takeTop[10]each cluster.i.asKeywords docs; - summary:i.fastSum[docs]%count docs; - centroids:(); - do[n; - centroids,:nearest:i.maxIndex docs[;i.maxIndex summary]; - summary-:docs nearest; - summary:(where summary<0)_ summary]; - cluster.groupByCentroids[docs centroids;docs]} + nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers] + } diff --git a/code/dateTime.q b/code/dateTime.q new file mode 100644 index 0000000..0b7f580 --- /dev/null +++ b/code/dateTime.q @@ -0,0 +1,184 @@ +// code/dateTime.q - Nlp time utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for handling dates and times + +\d .nlp + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Pads a string containing a single integer to two digits +// or extracts the last 2 digits from a string +// @param day {string} Contains a date +// @returns {string} Padded date to two digits +tm.i.parseDay:{[day] + -2#"0",day where day in .Q.n + } + +// @private +// @kind dictionary +// @category nlpTimeUtility +// @desc Dictionary mapping the months of the year +// @type dictionary +// to a symbol denoting integer representation +tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert a long-form or short-form month string to +// a string denoting the month as an integer "feb"/"february" +// become "02" +// @param day {string} A month of the year in English +// @returns {string} A padded integer representing the month of the year +tm.i.parseMonth:{[month] + -2#"0",string month^tm.i.months month:lower`$3 sublist month + } + +// @private +// @kind function +// @category 
nlpTimeUtility +// @desc Pad a string denoting a year to 4 digits +// if input > 35 this is deemed to be 1900s +// i.e. "20" -> "2020" / "44" -> "1944") +// @param year {string} Contains a year +// @returns {string} Padded year value +tm.i.parseYear:{[year] + -4#$[35<"I"$-2#year;"19";"20"],year + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert year string to the entire date +// encapsulating that year +// @param year {string} A year +// @returns {string} Date range from Jan 1 to Dec 31 of +// the specified year +tm.i.convY:{[year] + "D"$year,/:(".01.01";".12.31") + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert string containing yearMonth +// to the date range encapsulating that month +// i.e. "test 2020.02" -> 2020.02.01 2020.02.29 +// "2019.02 test" -> 2019.02.01 2019.02.28 +// @param text {string} Text containing yearMonth value +// @returns {string} Date range for the month of the +// provided yearMonth +tm.i.convYearMonth:{[text] + txt:regex.matchAll[;text]each regex.objects`year`month; + matches:ungroup([format:"ym"]txt); + updMatches:matches,'flip`txt`s`e!flip matches`txt; + matches:value select format,last txt by s from updMatches; + format:tm.i.formatYM/[matches`format]; + format:raze@[format;i where 1 "y","m" +// @params ym {string[]} The format for each date objecct +// @returns {string} Formats of YearMonths objects seperated +tm.i.formatYM:{[ym] + @[ym;where not counts;except[;raze ym where counts:1=count each ym]] + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert string containing yearMonthDay +// to the date range encapsulating that day +// i.e. "test 2020.01.01" -> 2020.01.01 2020.01.01 +// "2010.01.01 test" -> 2010.01.01 2010.01.01 +// @param text {string} Text containing yearMonthDay value +// @returns {string} Date range associated with the +// provided yearMonthDay +tm.i.convYearMonthDay:{[text] + txt:regex.matchAll[;text]each regex.objects`year`month`day; + matches:ungroup([format:"ymd"]txt); + updMatches:matches,'flip`txt`s`e!flip matches`txt; + matches:value select format,last txt by s from updMatches; + format:tm.i.formatYMD/[matches`format]; + format:tm.i.resolveFormat raze@[format;where 1 "y","m","d" +// @params ymd {string[]} The format for each date objecct +// @returns {string} Formats of YearMonthDays objects seperated +tm.i.formatYMD:{[ymd] + @[ymd;i unq;:;"ymd" unq:where 1=count each i:where each "ymd" in/:\:ymd] + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Fill in the blanks in a date format string +// @param format {string} A date format, as some permutation of +// "d", "m", and "y" +// @returns {string} The date format with any blanks filled with their most +// plausible value +tm.i.resolveFormat:{[format] + $[0=n:sum" "=format; + ; + 1=n; + ssr[;" ";first"ymd"except format]; + 2=n; + tm.i.dateFormats; + {"dmy"} + ]format + } + +// @private +// @kind dictionary +// @category nlpTimeUtility +// @desc The format to use, given a single known position +// @type dictionary +tm.i.dateFormats:(!). 
flip( + ("d ";"dmy"); // 10th 02 99 + ("m ";"mdy"); // Feb 10 99 + ("y ";"ymd"); // 1999 02 10 + (" d ";"mdy"); // 02 10th 99 + (" m ";"dmy"); // 10 Feb 99 + (" y ";"dym"); // 10 1999 02 This is never conventionally used + (" d";"ymd"); // 99 02 10th + (" m";"ydm"); // 99 10 Feb This is never conventionally used + (" y";"dmy")) // 10 02 1999 //mdy is the american option + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Turns a regex time string into a q timestamp +// i.e "131030" -> 13:10:30.000 +// "1pm" -> 13:00:00.000 +// @param text {string} A time string +// @returns {timestamp} The q time parsed from an +// appropriate string +tm.i.parseTime:{[text] + numText:vs[" ";text][0]in"1234567890:."; + time:"T"$text where numText; + amPM:regex.i.check[;text]each regex.objects`am`pm; + time+$[amPM[0]&12=`hh$time;-1;amPM[1]&12>`hh$time;1;0]*12:00 + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Remove any null values +// @array {number[][]} Array of values +// returns {number[][]} Array with nulls removed +tm.i.rmNull:{[array] + array where not null array[;0] + } diff --git a/code/date_time.q b/code/date_time.q deleted file mode 100644 index 9bfa945..0000000 --- a/code/date_time.q +++ /dev/null @@ -1,65 +0,0 @@ -\d .nlp - -// Pad day string to 2 digits -tm.i.parseDay:{-2#"0",x where x in .Q.n} - -// Convert month string and pad to 2 digits -tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 -tm.i.parseMonth:{-2#"0",string x^tm.i.months x:lower`$3 sublist x} - -// Pad year string to 4 digits (>35 deemed 1900s) -tm.i.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x} - -// Convert year string to date range -tm.i.convY:{"D"$x,/:(".01.01";".12.31")} - -// Convert yearmonth string to date range -tm.i.convYM:{ - matches:ungroup([fmt:"ym"]txt:regex.matchAll[;x]each regex.objects`year`month); - matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt; - fmt:{@[x;where not xx;except[;raze x where xx:1=count each x]]}/[matches`fmt]; - fmt:raze@[fmt;i where 1`hh$tm;1;0]*12:00} - - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -tm.findTimes:{time:(tm.i.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01} - -// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) -tm.findDates:{[text] - rmInv:{x where not null x[;0]}; - ym:regex.matchAll[regex.objects.yearmonth;text]; - ymd:regex.matchAll[regex.objects.yearmonthday;text]; - dts:rmInv(tm.i.convYMD each ymd[;0]),'ymd; - if[count dts;ym@:where not any ym[;1] within/: dts[; 3 4]]; - dts,:rmInv(tm.i.convYM each ym[;0]),'ym; - dts iasc dts[;3]} - diff --git a/code/email.q b/code/email.q index 490e4e9..79945d0 100644 --- a/code/email.q +++ b/code/email.q @@ -1,59 +1,224 @@ -\d .nlp +// code/email.q - Nlp email utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for handling emails -//Loading python script to extract rtf text -system"l ",.nlp.path,"/","code/extract_rtf.p"; -i.striprtf:.p.get[`striprtf;<] +\d .nlp -// Read mbox file, convert to table, parse metadata & content -email.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Rich Text Format (RTF) parsing function imported from python +email.i.striprtf:.p.get[`striprtf;<] -email.i.findmime:{all(99=type each y`payload;x~/:y`contentType;0b~'y[`payload]@'`attachment)} 
-email.i.html2text:{email.i.bs[x;"html.parser"][`:get_text;"\\n"]`} / extract text from html -email.i.extractText:{ - / string is actual text, bytes attachment or non text mime type like inline image, dict look at content element - $[10=type x;x;4=type x;"";99=type x;.z.s x`content; - count i:where email.i.findmime["text/plain"]x;"\n\n"sv{x[y][`payload]`content}[x]each i; - / use beautiful soup to extract text from html - count i:where email.i.findmime["text/html"]x ;"\n\n"sv{email.i.html2text x[y][`payload]`content}[x]each i; - / use python script to extract text from rtf - count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{i.striprtf x[y][`payload]`content}[x]each i; - "\n\n"sv .z.s each x`payload]} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract information from various message text types +// @params textTyp {string} The format of the message text +// @param msg {string|dictionary} An email message, or email subtree +// @returns {boolean} Whether or not msg fits the text type criteria +email.i.findMime:{[textTyp;msg] + msgDict:99=type each msg`payload; + contentTyp:textTyp~/:msg`contentType; + attachment:0b~'msg[`payload]@'`attachment; + all(msgDict;contentTyp;attachment) + } +// @private +// @kind function +// @category nlpEmailUtility +// @desc Use beautiful soup to extract text from a html file +// @param msg {string} The message payload +// @returns {string} The text from the html +email.i.html2text:{[msg] + email.i.bs[msg;"html.parser"][`:get_text;"\\n"]` + } -// Graph of who emailed whom, inc number of mails -email.getGraph:{[msgs] - 0!`volume xdesc select volume:count i by sender,to from flip`sender`to!flip`$raze email.i.getToFrom each msgs} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Given an email, extract the text of the email +// @param msg {string|dictionary} An email message, or email subtree +// @returns {string} The text of the email, or email subtree +email.i.extractText:{[msg] + // String is actual text, bytes attachment or non text mime type like inline + // image, dict look at content element + msgType:type msg; + if[10=msgType;:msg]; + if[4=msgType;:""]; + if[99=msgType;:.z.s msg`content]; + findMime:email.i.findMime[;msg]; + text:$[count i:where findMime["text/plain"]; + {x[y][`payload]`content}[msg]each i; + count i:where findMime["text/html"]; + {email.i.html2text x[y][`payload]`content}[msg]each i; + count i:where findMime["application/rtf"]; + // Use python script to extract text from rtf + {email.i.striprtf x[y][`payload]`content}[msg]each i; + .z.s each msg`payload + ]; + "\n\n"sv text + } -// Get to/from pairs from an email +// @private +// @kind function +// @category nlpEmailUtility +// @desc Get all the to/from pairs from an email +// @param msg {dictionary} An email message, or subtree thereof +// @returns {any[]} To/from pairings of an email email.i.getToFrom:{[msg] - ((msg[`sender;0;1];)each msg[`to;;1]),$[98=type p:msg`payload;raze .z.s each p;()]} + payload:msg`payload; + payload:$[98=type payload;raze .z.s each payload;()]; + edges:(msg[`sender;0;1];)each msg[`to;;1]; + edges,payload + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the sender information from an email +// @param emails {<} The email as an embedPy object +// @returns {string[]} Sender name and email +email.i.getSender:{[emails] + fromInfo:raze emails[`:get_all;<]each("from";"resent-from"); + email.i.getAddr fromInfo where not(::)~'fromInfo + } + +// @private +// @kind function 
+// @category nlpEmailUtility +// @desc Extract the receiver information from an email +// @param emails {<} The email as an embedPy object +// @returns {string[]} Reciever name and email +email.i.getTo:{[emails] + toInfo:raze emails[`:get_all;<]each("to";"cc";"resent-to";"resent-cc"); + email.i.getAddr toInfo where not any(::;"")~/:\:toInfo + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the date information from an email +// @param emails {<} The email as an embedPy object +// @returns {timestamp} Date email was sent +email.i.getDate:{[emails] + dates:string 6#email.i.parseDate emails[@;`date]; + "P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each dates + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the subject information from an email +// @param emails {<} The email as an embedPy object +// @returns {string} Subject of the email +email.i.getSubject:{[emails] + subject:emails[@;`subject]; + $[(::)~subject`; + ""; + email.i.makeHdr[email.i.decodeHdr subject][`:__str__][]` + ] + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the content type of an email +// @param emails {<} The email as an embedPy object +// @returns {string} Content type of an email +email.i.getContentType:{[emails] + emails[`:get_content_type][]` + } -// Init python and q functions for reading mbox files -email.i.parseMail:{email.i.parseMbox1 email.i.msgFromString[x]`.} -email.i.parseMbox:{email.i.parseMbox1 each .p.list[<] .p.import[`mailbox;`:mbox]x} -email.i.parseMbox1:{k!email.get.i[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the payload information from an email +// @param emails {<} The email as an embedPy object +// @returns {dictionary|table} Dictionary of `attachment`content or a table +// of payloads +// Content is byte[] for binary data, char[] for text +email.i.getPayload:{[emails] + if[emails[`:is_multipart][]`; + :email.i.parseMbox1 each emails[`:get_payload][]` + ]; + // Raw bytes decoded from base64 encoding, wrapped embedPy + raw:emails[`:get_payload;`decode pykw 1]; + rtf:"application/rtf"~email.i.getContentType emails; + attachment:"attachment"~emails[`:get_content_disposition][]`; + payload:`attachment`content!(0b;raw`); + if[all(rtf;attachment);:payload]; + if[attachment; + payload,`attachment`filename!(1b;email[`:get_filename][]`); + ]; + content:email.i.getContentType emails; + if[not any content~/:("text/html";"text/plain";"message/rfc822");:payload]; + charset:emails[`:get_content_charset][]`; + content:i.str[raw;$[(::)~charset;"us-ascii";charset];"ignore"]`; + `attachment`content!(0b;content) + } +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params filepath {string} The path to the mbox +// @returns {dictionary} Meta information from the email +email.i.parseMbox:{[filepath] + mbox:email.i.mbox filepath; + email.i.parseMbox1 each .p.list[<] mbox + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params mbox {<} Emails in mbox format +// @returns {dictionary} Meta information from the email +email.i.parseMbox1:{[mbox] + columns:`sender`to`date`subject`contentType`payload; + msgInfo:`getSender`getTo`getDate`getSubject`getContentType`getPayload; + columns!email.i[msgInfo]@\:.p.wrap mbox + } + +// Python imports email.i.bs:.p.import[`bs4]`:BeautifulSoup 
-email.i.getaddr:.p.import[`email.utils;`:getaddresses;<] -email.i.parsedate:.p.import[`email.utils;`:parsedate;<] -email.i.decodehdr:.p.import[`email.header;`:decode_header] -email.i.makehdr:.p.import[`email.header;`:make_header] +email.i.getAddr:.p.import[`email.utils;`:getaddresses;<] +email.i.parseDate:.p.import[`email.utils;`:parsedate;<] +email.i.decodeHdr:.p.import[`email.header;`:decode_header] +email.i.makeHdr:.p.import[`email.header;`:make_header] email.i.msgFromString:.p.import[`email]`:message_from_string +email.i.mbox:.p.import[`mailbox]`:mbox + + +// @kind function +// @category nlpEmail +// @desc Convert an mbox file to a table of parsed metadata +// @param filepath {string} The path to the mbox file +// @returns {table} Parsed metadata and content of the mbox file +email.loadEmails:{[filepath] + parseMbox:email.i.parseMbox filepath; + update text:.nlp.email.i.extractText each payload from parseMbox + } + +// @kind function +// @category nlpEmail +// @desc Get the graph of who emailed who, including the number of +// times they emailed +// @param emails {table} The result of .nlp.loadEmails +// @returns {table} Defines to-from pairings of emails +email.getGraph:{[emails] + getToFrom:flip`$raze email.i.getToFrom each emails; + getToFromTab:flip`sender`to!getToFrom; + 0!`volume xdesc select volume:count i by sender,to from getToFromTab + } -email.get.i.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")} -email.get.i.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")} -email.get.i.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]} -email.get.i.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]} -email.get.i.contentType:{x[`:get_content_type][]`} -/ return a dict of `attachment`content or a table of payloads, content is byte[] for binary data, char[] for text -email.get.i.payload:{ - if[x[`:is_multipart][]`;:email.i.parseMbox1 each x[`:get_payload][]`]; - raw:x[`:get_payload;`decode pykw 1]; / raw bytes decoded from base64 encoding, wrapped embedPy - if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_disposition][]`);:`attachment`content!(0b;raw`)]; - if["attachment"~x[`:get_content_disposition][]`;:`attachment`content`filename!(1b;raw`;x[`:get_filename][]`)]; - /if text is in rtf, mbox treats it as an attachment - /if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_dispositon][]`);:`attachment`content!(0b;raw`)]; - / e.g. inline images, return raw bytes in payload - if[not any(ct:x[`:get_content_type][]`)~/:("text/html";"text/plain";"message/rfc822");:`attachment`content!(0b;raw`)]; - :`attachment`content!(0b;i.str[raw;$[(::)~s:x[`:get_content_charset][]`;"us-ascii";s];"ignore"]`) - } +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params filepath {string} The path to where the email is stored +// @returns {dictionary} Meta information from the email +email.parseMail:{[filepath] + email.i.parseMbox1 email.i.msgFromString[filepath]`. 
+ } diff --git a/code/extract_rtf.p b/code/extractRtf.p similarity index 99% rename from code/extract_rtf.p rename to code/extractRtf.p index e0af14a..d154de2 100644 --- a/code/extract_rtf.p +++ b/code/extractRtf.p @@ -5,7 +5,7 @@ import re def striprtf(text): pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) - # control words which specify a "destionation". + # control words which specify a "destination". destinations = frozenset(( 'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', 'atnparent','atnref','atntime','atrfend','atrfstart','author','background', diff --git a/code/nlpCode.q b/code/nlpCode.q new file mode 100644 index 0000000..4806d5d --- /dev/null +++ b/code/nlpCode.q @@ -0,0 +1,437 @@ +// code/nlpCode.q - NLP code +// Copyright (c) 2021 Kx Systems Inc +// +// Main NLP code base + +\d .nlp + +// Date-Time + +// @kind function +// @category nlp +// @desc Find any times in a string +// @param text {string} A text, potentially containing many times +// @returns {any[]} A list of tuples for each time containing +// (q-time; timeText; startIndex; 1+endIndex) +findTimes:{[text] + timeText:regex.matchAll[regex.objects.time;text]; + parseTime:tm.i.parseTime each timeText[;0]; + time:parseTime,'timeText; + time where time[;0]<24:01 + } + +// @kind function +// @category nlp +// @desc Find all the dates in a document +// @param text {string} A text, potentially containing many dates +// @returns {any[]} A list of tuples for each time containing +// (startDate; endDate; dateText; startIndex; 1+endIndex) +findDates:{[text] + ym:regex.matchAll[regex.objects.yearMonth;text]; + ymd:regex.matchAll[regex.objects.yearMonthDay;text]; + convYMD:tm.i.convYearMonthDay each ymd[;0]; + dates:tm.i.rmNull convYMD,'ymd; + if[count dates;ym@:where not any ym[;1] within/: dates[; 3 4]]; + convYM:tm.i.convYearMonth each ym[;0]; + dates,:tm.i.rmNull convYM,'ym; + dates iasc dates[;3] + } + +// Parsing function + +// @kind function +// @category nlp +// @desc Parse URLs into dictionaries containing the +// constituent components +// @param url {string} The URL to decompose into its components +// @returns {dictionary} Contains information about the scheme, domain name +// and other URL information +parseURLs:{[url] + urlKeys:`scheme`domainName`path`parameters`query`fragment; + urlVals:parser.i.parseURLs url; + urlKeys!urlVals + } + +// @kind function +// @category nlp +// @desc Create a new parser +// @param spacyModel {symbol} The spaCy model/language to use. +// This must already be installed. 
+// @param fieldNames {symbol[]} The fields the parser should return +// @returns {fn} A function to parse text +newParser:{[spacyModel;fieldNames] + options:{distinct x,raze parser.i.depOpts x}/[fieldNames]; + disabled:`ner`tagger`parser except options; + model:parser.i.newSubParser[spacyModel;options;disabled]; + tokenAttrs:parser.i.q2spacy key[parser.i.q2spacy]inter options; + pyParser:parser.i.parseText[model;tokenAttrs;options;]; + stopWords:(`$.p.list[model`:Defaults.stop_words]`),`$"-PRON-"; + parser.i.runParser[pyParser;fieldNames;options;stopWords] + } + +// Sentiment + +// @kind function +// @category nlp +// @desc Calculate the sentiment of a sentence or short message, +// such as a tweet +// @param text {string} The text to score +// @returns {dictionary} The score split up into compound, positive, negative +// and neutral components +sentiment:{[text] + valences:sent.i.lexicon tokens:lower rawTokens:sent.i.tokenize text; + isUpperCase:(rawTokens=upper rawTokens)& rawTokens<>tokens; + upperIndices:where isUpperCase & not all isUpperCase; + valences[upperIndices]+:sent.i.ALLCAPS_INCR*signum valences upperIndices; + valences:sent.i.applyBoosters[tokens;isUpperCase;valences]; + valences:sent.i.negationCheck[tokens;valences]; + valences:sent.i.butCheck[tokens;valences]; + sent.i.scoreValence[0f^valences;text] + } + +// Comparing docs/terms + +// @kind function +// @category nlp +// @desc Calculates the affinity between terms in two corpus' using +// an Algorithm from Rayson, Paul and Roger Garside. +// "Comparing corpora using frequency profiling." +// Proceedings of the workshop on Comparing Corpora. Association for +// Computational Linguistics, 2000 +// @param parsedTab1 {table} A parsed document containing keywords and their +// associated significance scores +// @param parsedTab2 {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary[]} A dictionary of terms and their affinities for +// parsedTab2 over parsedTab1 +compareCorpora:{[parsedTab1;parsedTab2] + if[not min count each (parsedTab1;parsedTab2);:((`$())!();(`$())!())]; + termCountA:i.getTermCount parsedTab1; + termCountB:i.getTermCount parsedTab2; + totalWordCountA:sum termCountA; + totalWordCountB:sum termCountB; + // The expected termCount of each term in each corpus + coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); + expectedA:totalWordCountA*coef; + expectedB:totalWordCountB*coef; + // Return the differences between the corpora + dict1:desc termCountA*log termCountA%expectedA; + dict2:desc termCountB*log termCountB%expectedB; + (dict1;dict2) + } + +// @kind function +// @category nlp +// @desc Calculates the cosine similarity of two documents +// @param keywords1 {dictionary} Keywords and their significance scores +// @param keywords2 {dictionary} Keywords and their significance scores +// @returns {float} The cosine similarity of two documents +compareDocs:{[keyword1;keyword2] + keywords:distinct raze key each(keyword1;keyword2); + cosineSimilarity .(keyword1;keyword2)@\:keywords + } + +// @kind function +// @category nlp +// @desc A function for comparing the similarity of two vectors +// @param keywords1 {dictionary} Keywords and their significance scores +// @param keywords2 {dictionary} Keywords and their significance scores +// @returns {float} Similarity score between -1f and 1f inclusive, 1 being +// perfectly similar, -1 being perfectly dissimilar +cosineSimilarity:{[keywords1;keywords2] + sqrtSum1:sqrt sum keywords1*keywords1; + 
sqrtSum2:sqrt sum keywords2*keywords2; + sum[keywords1*keywords2]%(sqrtSum1)*sqrtSum2 + } + +// @kind function +// @category nlp +// @desc Calculate how much each term contributes to the +// cosine similarity +// @param keywords1 {dictionary} Keywords and their significance scores +// @param keywords2 {dictionary} Keywords and their significance scores +// @returns {dictionary} A dictionary of how much of the similarity score each +// token is responsible for +explainSimilarity:{[keywords1;keywords2] + alignedKeys:inter[key keywords1;key keywords2]; + keywords1@:alignedKeys; + keywords2@:alignedKeys; + product:(keywords2%i.magnitude keywords1)*(keywords2%i.magnitude keywords2); + desc alignedKeys!product%sum product + } + +// @kind function +// @category nlp +// @desc Calculates the cosine similarity of a document and a centroid, +// subtracting the document from the centroid. +// This does the subtraction after aligning the keys so that terms not in +// the centroid don't get subtracted. +// This assumes that the centroid is the sum, not the avg, of the documents +// in the cluster +// @param centroid {dictionary} The sum of all the keywords significance scores +// @param keywords {dictionary} Keywords and their significance scores +// @returns {float} The cosine similarity of a document and centroid +compareDocToCentroid:{[centroid;keywords] + keywords@:alignedKeys:distinct key[centroid],key keywords; + vec:centroid[alignedKeys]-keywords; + cosineSimilarity[keywords;vec] + } + +// @kind function +// @category nlp +// @desc Find the cosine similarity between one document and all the +// other documents of the corpus +// @param keywords {dictionary} Keywords and their significance scores +// @param idx {number} The index of the feature vector to compare to the rest +// of the corpus +// @returns {float[]} The document's significance to the rest of the corpus +compareDocToCorpus:{[keywords;idx] + compareDocs[keywords idx]each(idx+1)_ keywords + } + +// @kind function +// @category nlp +// @desc Calculate the Jaro-Winkler distance of two strings, +// scored between 0 and 1 +// @param str1 {str|string[]} A string of text +// @param str2 {string|string[]} A string of text +// @returns {float} The Jaro-Winkler of two strings, between 0 and 1 +jaroWinkler:{[str1;str2] + str1:lower str1; + str2:lower str2; + jaroScore:i.jaro[str1;str2]; + jaroScore+$[0.70;term]#results + } + +// @kind function +// @category nlp +// @desc Find tokens that contain the term where each consecutive word +// has an above-average co-occurrence with the term +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param term {symbol} The term to extract phrases around +// @returns {dictionary} Phrases as the keys, and their relevance as the values +extractPhrases:{[parsedTab;term] + term:lower term; + tokens:parsedTab`tokens; + related:findRelatedTerms[parsedTab]term; + // This gets the top words that have an above average relavance to the + // query term + relevant:term,sublist[150]where 01)#phrases + } + +// @kind function +// @category nlp +// @desc Given an input which is conceptually a single document, +// such as a book, this will give better results than TF-IDF. +// This algorithm is explained in the paper Carpena, P., et al. +// "Level statistics of words: Finding keywords in literary texts +// and symbolic sequences." +// Physical Review E 79.3 (2009): 035102. 
+// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} Where the keys are keywords as symbols, and the values +// are their significance, as floats,with higher values being more +// significant +keywordsContinuous:{[parsedTab] + text:raze parsedTab[`tokens]@'where each not parsedTab`isStop; + groupTxt:group text; + n:count each groupTxt; + // Find the distinct words, ignoring stop words and those with 3 or fewer + // occurences, or make up less than .002% of the corpus + words:where n>=4|.00002*count text; + // Find the distances between occurences of the same word + // and use this to generate a 'sigma value' for each word + dist:deltas each words#groupTxt; + n:words#n; + sigma:(dev each dist)%(avg each dist)*sqrt 1-n%count text; + stdSigma:1%sqrt[n]*1+2.8*n xexp -0.865; + chevSigma:((2*n)-1)%2*n+1; + desc(sigma-chevSigma)%stdSigma + } + +// @kind function +// @category nlp +// @desc Find the TF-IDF scores for all terms in all documents +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary[]} For each document, a dictionary with the tokens as +// keys, and relevance as values +TFIDF:{[parsedTab] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + words:distinct each tokens; + // The term frequency of each token within the document + TF:{x!{sum[x in y]%count x}[y]each x}'[words;tokens]; + // Calculate the inverse document frequency + IDF:1+log count[tokens]%{sum{x in y}[y]each x}[tokens]each words; + TF*IDF + } + +// Exploratory Analysis + +// @kind function +// @category nlp +// @desc Find runs of tokens whose POS tags are in the set passed in +// @param tagType {symbol} `uniPOS or `pennPOS (Universal or Penn +// Part-of-Speech) +// @param tags {symbol|symbol[]} One or more POS tags +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {list} Two item list containing +// 1. The text of the run as a symbol vector +// 2. The index associated with the first token +findPOSRuns:{[tagType;tags;parsedTab] + matchingTag:parsedTab[tagType]in tags; + start:where 1=deltas matchingTag; + lengths:sum each start cut matchingTag; + idx:start+til each lengths; + runs:`$" "sv/:string each parsedTab[`tokens]start+til each lengths; + flip(runs;idx) + } + +// @kind function +// @category nlp +// @desc Determine the probability of one word following another +// in a sequence of words +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} The probability that the secondary word in the +// sequence follows the primary word. 
+biGram:{[parsedTab] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:raze parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + occurance:(distinct tokens)!{count where y=x}[tokens]each distinct tokens; + raze i.biGram[tokens;occurance]''[tokens;next tokens] + } + +// @kind function +// @category nlp +// @desc Determine the probability of a `n` tokens appearing together +// in a text +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param n {long} The number of words to occur together +// @returns {dictionary} The probability of `n` tokens appearing together in +// a text +nGram:{[parsedTab;n] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:raze parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + tab:rotate\:[til n]tokens; + nGroup:last[tab]group neg[n-1]_flip(n-1)#tab; + occurance:{(count each group x)%count x}each nGroup; + returnKeys:raze key[occurance],/:'{key x}each value occurance; + returnVals:raze value each value occurance; + returnKeys!returnVals + } + +// Util + +// @kind function +// @category nlp +// @desc Find Regular expressions within texts +// @param text {string[]} The text of a document +// @param expr {symbol} The expression type to be searched for within the text +findRegex:{[text;expr] + nExpr:$[1=count expr;enlist;]; + regexKeys:nExpr expr; + regexVals:nExpr{regex.matchAll[regex.objects[x];y]}[;text]each expr; + regexKeys!regexVals + } + +// @kind function +// @category nlp +// @desc Remove any non-ascii characters from a text +// @param text {string} A string of text +// @returns {string} Non-ascii characters removed from the text +removeNonAscii:{[text] + text where text within (0;127) + } + +// @kind function +// @category nlp +// @desc Remove certain characters from a string of text +// @param text {string} A string of text +// @param char {string[]} Characters or expressions to be removed from the text +// @returns {string} The text without anything that contains the defined +// characters +removeCustom:{[text;char] + vecText:" " vs text; + rtrim raze(vecText where{not(max ,'/)x like/:y}[;char]each vecText),'" " + } + +// @kind function +// @category nlp +// @desc Remove and replace certain characters from a string of text +// @param text {string} A string of text +// @param char {string[]} Characters or expressions to be removed from the text +// @param replace {string} The characters which will replace the removed +// characters +removeReplace:{[text;char;replace] + {x:ssr[x;y;z];x}[;;replace]/[text;char] + } + +// @kind function +// @category nlp +// @desc Detect language from text +// @param text {string} A string of text +// @returns {symbol} The language of the text +detectLang:{[text] + `$.p.import[`langdetect][`:detect;<][text] + } + +// @kind function +// @category nlp +// @desc Import all files in a directory recursively +// @param filepath {string} The directories file path +// @returns {table} Filenames, paths and texts contained within the filepath +loadTextFromDir:{[filepath] + path:{raze$[-11=type k:key fp:hsym x;fp;.z.s each` sv'fp,'k]}`$filepath; + ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path) + } + +// @kind function +// @category nlp +// @desc Get all the sentences for a document +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {string[]} All the sentences from a document +getSentences:{[parsedTab] + (sublist[;parsedTab`text]deltas@)each parsedTab`sentChars + } + 
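The functions defined above in nlpCode.q supersede the versions removed from nlp_code.q below. A minimal sketch of how the main entry points chain together, assuming an English spaCy model (here `en_core_web_sm) is already installed; the model name and the sample strings are illustrative only, not part of the change:

// Hypothetical usage sketch: build a parser returning the fields TFIDF needs
parser:.nlp.newParser[`en_core_web_sm;`tokens`isStop`sentChars`sentIndices]
parsed:parser enlist "The quick brown fox jumped over the lazy dog at 10:30am"
keywords:.nlp.TFIDF parsed                        / one token!score dictionary per document
.nlp.findTimes "Meet at 2pm or 14:30"             / each match: (q-time; timeText; startIndex; 1+endIndex)
.nlp.findDates "Due on 2020.01.01, not 2019.02"   / each match: (startDate; endDate; dateText; startIndex; 1+endIndex)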
diff --git a/code/nlp_code.q b/code/nlp_code.q deleted file mode 100644 index 28f2e83..0000000 --- a/code/nlp_code.q +++ /dev/null @@ -1,163 +0,0 @@ -\d .nlp - -// Date-Time - -// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) -findDates:tm.findDates - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -findTimes:tm.findTimes - -// Email - -// Read mbox file, convert to table, parse metadata & content -email.loadEmails:loadEmails:email.getMboxText - -// Graph of who emailed whom, inc number of mails -email.getGraph:{[msgs] - 0!`volume xdesc select volume:count i by sender,to from flip`sender`to!flip`$raze email.i.getToFrom each msgs} - -email.parseMail:email.i.parseMail - -// Sentiment - -// Calculate sentiment of sentence of short message -sentiment:sent.score - -// Comparing docs/terms - -// Give 2 dicts of each term's affinity to each corpus -// Algorithm from Rayson, Paul, and Roger Garside. "Comparing corpora using frequency profiling." -// Proceedings of the workshop on Comparing Corpora. Association for Computational Linguistics, 2000 -compareCorpora:{[corp1;corp2] - if[(not count corp1)|(not count corp2);:((`$())!();(`$())!())]; - getTermCount:{[corp] - i.fastSum{1+log count each group x}each corp[`tokens]@'where each not corp`isStop}; - totalWordCountA:sum termCountA:getTermCount corp1; - totalWordCountB:sum termCountB:getTermCount corp2; - // The expected termCount of each term in each corpus - coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); - expectedA:totalWordCountA*coef; - expectedB:totalWordCountB*coef; - // Return the differences between the corpora - (desc termCountA*log termCountA%expectedA;desc termCountB*log termCountB%expectedB)} - -// Calc cosine similarity of two docs -compareDocs:{cosineSimilarity .(x;y)@\:distinct raze key each(x;y)} - -// Compare similarity of 2 vectors -cosineSimilarity:{sum[x*y]%(sqrt sum x*x)*sqrt sum y*y} - -// How much each term contributes to the cosine similarity -explainSimilarity:{[doc1;doc2] - alignedKeys:inter[key doc1;key doc2]; - doc1@:alignedKeys; - doc2@:alignedKeys; - product:(doc2%i.magnitude doc1)*(doc2%i.magnitude doc2); - desc alignedKeys!product%sum product} - -// Cosine similarity of doc and centroid -compareDocToCentroid:{[centroid;doc] - doc@:alignedKeys:distinct key[centroid],key doc; - cosineSimilarity[doc;centroid[alignedKeys]-doc]} - -// Calc cosine similarity between doc and entire corpus -compareDocToCorpus:i.compareDocToCorpus - -// Jaro-Winkler distance between 2 strings -jaroWinkler:{i.jaroWinkler[lower x;lower y]} - -// Feature Vectors - -// Generate feature vector (of stemmed tokens) for a term -findRelatedTerms:{[docs;term] - sent:raze docs[`sentIndices]cut'@'[docs[`tokens];where each docs`isStop;:;`]; - sent@:asc distinct raze 0|-1 0 1+\:where(term:lower term)in/:sent; - ccur:` _ count each group raze distinct each sent; - tcur:idx@'group each docs[`tokens]@'idx:where each docs[`tokens]in\:key ccur; - tcur:i.fastSum((count distinct@)each)each docs[`sentIndices]bin'tcur; - ccur%:tcur term; - tcur%:sum count each docs`sentIndices; - desc except[where r>0;term]#r:(ccur-tcur)%sqrt tcur*1-tcur} - -// Find runs containing term where each word has above average co-ocurrance with term -extractPhrases:{[corpus;term] - relevant:term,sublist[150]where 01)#r:count each group r where term in/:r:raze tokens@'runs} - -// On a conceptually single doc (e.g. 
novel), gives better results than TF-IDF -// This algorithm is explained in the paper -// Carpena, P., et al. "Level statistics of words: Finding keywords in literary texts and symbolic sequences." -// Physical Review E 79.3 (2009): 035102. -keywordsContinuous:{[docs] - n:count each gt:group text:raze docs[`tokens]@'where each not docs`isStop; - words:where n>=4|.00002*count text; - dist:deltas each words#gt; - sigma:(dev each dist)%(avg each dist)*sqrt 1-(n:words#n)%count text; - std_sigma:1%sqrt[n]*1+2.8*n xexp -0.865; - chev_sigma:((2*n)-1)%2*n+1; - desc(sigma-chev_sigma)%std_sigma} - -// Find TFIDF scores for all terms in all documents -TFIDF:{[corpus] - tokens:corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - tab:{x!{sum[x in y]%count x}[y]each x}'[words:distinct each tokens;tokens]; - tab*idf:1+log count[tokens]%{sum{x in y}[y]each x}[tokens]each words} - -TFIDF_tot:{[corpus]desc sum t%'sum each t:TFIDF corpus} - -// Parse Data - -// Create a new parser using a spaCy model (must already be installed) -newParser:parser.newParser - -// Parse urls to dictionaries -parseURLs:{`scheme`domainName`path`parameters`query`fragment!i.parseURLs x} - -// Exploratory Analysis - -// Find runs of tokens whose POS tags are in the set passed in -// Returns pair (text; firstIndex) -findPOSRuns:{[tagType;tags;doc] - start:where 1=deltas matchingTag:doc[tagType]in tags; - ii:start+til each lengths:sum each start cut matchingTag; - runs:`$" "sv/:string each doc[`tokens]start+til each lengths; - flip(runs;ii)} - -// Currently only for 2-gram -bi_gram:{[corpus] - tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - occ:(distinct tokens)!{count where y=x}[tokens]each distinct tokens; - raze{[x;y;z;n](enlist(z;n))!enlist(count where n=x 1+where z=x)%y[z]}[tokens;occ]''[tokens;next tokens]} - -// Util - -// Find Regular expressions within texts -findRegex:{[text;expr]($[n;enlist;]expr)!$[n:1=count[expr];enlist;]{regex.matchAll[regex.objects[x];y]}[;text]each expr} - -// Remove any ascii characters from a text -ascii:{x where x within (0;127)} - -// Remove certain characters from a string of text -rmv_custom:{rtrim raze(l where{not(max ,'/)x like/:y}[;y]each l:" "vs x),'" "} - -// Remove and replace certain characters from a string of text -rmv_main:{{x:ssr[x;y;z];x}[;;z]/[x;y]} - -// Detect language from text -detectLang:{[text]`$.p.import[`langdetect][`:detect;<][text]} - -// Import all files in a dir recursively -loadTextFromDir:{[fp] - path:{[fp]raze$[-11=type k:key fp:hsym fp;fp;.z.s each` sv'fp,'k]}`$fp; - ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path)} - -// Get all sentences for a doc -getSentences:i.getSentences - -// n-gram -ngram:{[corpus;n] - tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - raze[key[b],/:'{key x}each value b]!raze value each value b:{(count each group x)%count x - }each last[tab]group neg[n-1]_flip(n-1)#tab:rotate\:[til n]tokens} diff --git a/code/parser.p b/code/parser.p new file mode 100644 index 0000000..ae6a1c7 --- /dev/null +++ b/code/parser.p @@ -0,0 +1,40 @@ +## Python spell check function +p)def spell(doc,model): + lst=[] + for s in doc: + if s._.hunspell_spell==False: + sug=s._.hunspell_suggest + if len(sug)>0: + ([lst.append(n)for n in model((sug)[0])]) + else: + lst.append(s) + else: + lst.append(s) + return lst + +## Python function for running spacy +p)def get_doc_info(parser,tokenAttrs,opts,text): + doc=doc1=parser(text) + if('spell' in 
opts): + doc1=spell(doc,parser) + res=[[getattr(w,a)for w in doc1]for a in tokenAttrs] + if('sentChars' in opts): # indices of first+last char per sentence + res.append([(s.start_char,s.end_char)for s in doc.sents]) + if('sentIndices' in opts): # index of first token per sentence + res.append([s.start for s in doc.sents]) + res.append([w.is_punct or w.is_bracket or w.is_space for w in doc]) + return res + +## Python functions to detect sentence borders +p)def x_sbd(doc): + if len(doc): + doc[0].is_sent_start=True + for i,token in enumerate(doc[:-1]): + doc[i+1].is_sent_start=token.text in ['。','?','!'] + return doc + +## Python functionality for the generation of a url parser +p)from urllib.parse import urlparse +p)import re +p)seReg=re.compile('([a-z0-9]+:)?//') + diff --git a/code/parser.q b/code/parser.q index 56456f9..20aacf8 100644 --- a/code/parser.q +++ b/code/parser.q @@ -1,39 +1,38 @@ -\d .nlp +// code/parser.q - Nlp parser utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for parsing -p)def spell(doc,model): - lst=[] - for s in doc: - if s._.hunspell_spell==False: - sug=s._.hunspell_suggest - if len(sug)>0: - ([lst.append(n)for n in model((sug)[0])]) - else:lst.append(s) - else: - lst.append(s) - return lst +\d .nlp -// Python functions for running spacy -p)def get_doc_info(parser,tokenAttrs,opts,text): - doc=doc1=parser(text) - if('spell' in opts): - doc1=spell(doc,parser) - res=[[getattr(w,a)for w in doc1]for a in tokenAttrs] - if('sentChars' in opts): # indices of first+last char per sentence - res.append([(s.start_char,s.end_char)for s in doc.sents]) - if('sentIndices' in opts): # index of first token per sentence - res.append([s.start for s in doc.sents]) - res.append([w.is_punct or w.is_bracket or w.is_space for w in doc]) - return res +// @private +// @kind function +// @category nlpParserUtility +// @desc Retrieve python function for running spacy parser.i.parseText:.p.get[`get_doc_info;<]; -parser.i.cleanUTF8:.p.import[`builtins;`:bytes.decode;<][;`errors pykw`ignore]$["x"]@; -p)def x_sbd(doc): - if len(doc): - doc[0].is_sent_start=True - for i,token in enumerate(doc[:-1]): - doc[i+1].is_sent_start=token.text in ['。','?','!'] - return doc -// Dependent options +// @private +// @kind function +// @category nlpParserUtility +// @desc Convert string input to an appropriate +// byte representation suitable for application in Python +// functions, this is particularly useful when dealing with +// languages other than English +// @param data {string} Any input string containing any character +// arrays +// @returns {string} The data parsed such that UTF-8 compliant +// characters can be appropriately managed by the NLP models +parser.i.cleanUTF8:{[data] + byteDecode:.p.import[`builtins;`:bytes.decode;<]; + // Convert data to bytes and decode to appropriate string + byteDecode["x"$data;`errors pykw`ignore] + } + +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Dependent options for input to spacy module +// @type dictionary parser.i.depOpts:(!). flip( (`keywords; `tokens`isStop); (`sentChars; `sentIndices); @@ -43,7 +42,11 @@ parser.i.depOpts:(!). flip( (`lemmas; `tagger); (`isStop; `lemmas)) -// Map from q-style attribute names to spacy +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Map from q-style attribute names to spacy +// @type dictionary parser.i.q2spacy:(!). flip( (`likeEmail; `like_email); (`likeNumber; `like_num); @@ -55,68 +58,145 @@ parser.i.q2spacy:(!). 
flip( (`pennPOS; `tag_); (`starts; `idx)) -// Model inputs for spacy 'alpha' models -parser.i.alphalang:(!). flip( +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Model inputs for spacy 'alpha' models +// @type dictionary +parser.i.alphaLang:(!). flip( (`ja;`Japanese); (`zh;`Chinese)) -// Create new parser -// Valid opts : text keywords likeEmail likeNumber likeURL isStop tokens lemmas uniPOS pennPOS starts sentChars sentIndices spell -parser.newParser:{[lang;opts] - opts:{distinct x,raze parser.i.depOpts x}/[colnames:opts]; - disabled:`ner`tagger`parser except opts; - model:parser.i.newSubParser[lang;opts;disabled]; - tokenAttrs:parser.i.q2spacy key[parser.i.q2spacy]inter opts; - pyParser:parser.i.parseText[model;tokenAttrs;opts;]; - stopwords:(`$.p.list[model`:Defaults.stop_words]`),`$"-PRON-"; - parser.i.runParser[pyParser;colnames;opts;stopwords]} +// @private +// @kind function +// @category nlpParser +// @desc Create a new parser +// @param modelName {symbol} The spaCy model/language to use. +// This must already be installed. +// @param options {symbol[]} The fields the parser should return +// @param disabled {symbol[]} The modules to be disabled +// @returns {fn} a parser for the given language +parser.i.newSubParser:{[modelName;options;disabled] + checkLang:parser.i.alphaLang modelName; + lang:$[`~checkLang;`spacy;sv[`]`spacy.lang,modelName]; + model:.p.import[lang][hsym$[`~checkLang;`load;checkLang]]; + model:model . raze[$[`~checkLang;modelName;()];`disable pykw disabled]; + if[`sbd in options; + pipe:$[`~checkLang;model[`:create_pipe;`sentencizer];.p.pyget`x_sbd]; + model[`:add_pipe]pipe; + ]; + if[`spell in options; + spacyTokens:.p.import[`spacy.tokens][`:Token]; + if[not spacyTokens[`:has_extension]["hunspell_spell"]`; + spHun:.p.import[`spacy_hunspell]`:spaCyHunSpell; + platform:`$.p.import[`platform][`:system][]`; + osSys:$[`Darwin~platform;`mac;lower platform]; + hunspell:spHun[model;osSys]; + model[`:add_pipe]hunspell + ] + ]; + model + } -// Returns a parser for the given language -parser.i.newSubParser:{[lang;opts;disabled] - chklng:parser.i.alphalang lang; - model:.p.import[$[`~chklng;`spacy;sv[`]`spacy.lang,lang]][hsym$[`~chklng;`load;chklng] - ]. 
raze[$[`~chklng;lang;()];`disable pykw disabled]; - if[`sbd in opts;model[`:add_pipe]$[`~chklng;model[`:create_pipe;`sentencizer];.p.pyget `x_sbd]]; - if[`spell in opts;if[not .p.import[`spacy.tokens][`:Token][`:has_extension]["hunspell_spell"]`; - sphun:.p.import[`spacy_hunspell]`:spaCyHunSpell;hunspell:sphun[model; - $[`Darwin~syst:`$.p.import[`platform][`:system][]`;`mac;lower syst]];model[`:add_pipe]hunspell]]; - model} +// @private +// @kind function +// @category nlpParserUtility +// @desc Parser operations that must be done in q, or give better +// performance in q +// @param pyParser {fn} A projection to call the spacy parser +// @param fieldNames {symbol[]} The field names the parser should return +// @param options {symbol[]} The fields to compute +// @param stopWords {symbol[]} The stopWords in the text +// @param docs {string|string[]} The text being parsed +// @returns {dictionary|table} The parsed document(s) +parser.i.runParser:{[pyParser;fieldNames;options;stopWords;docs] + tab:parser.i.cleanUTF8 each docs; + parsed:parser.i.unpack[pyParser;options;stopWords]each tab; + if[`keywords in options;parsed[`keywords]:TFIDF parsed]; + fieldNames:($[1=count fieldNames;enlist;]fieldNames) except `spell; + fieldNames#@[parsed;`text;:;tab] + } -// Operations that must be done in q, or give better performance in q -parser.i.runParser:{[pyParser;colnames;opts;stopwords;docs] - t:parser.i.cleanUTF8 each docs; - parsed:parser.i.unpack[pyParser;opts;stopwords]each t; - if[`keywords in opts;parsed[`keywords]:TFIDF parsed]; - (($[1=count colnames;enlist;]colnames) except `spell)#@[parsed;`text;:;t]} - -// Operations that must be done in q, or give better performance in q -parser.i.unpack:{[pyParser;opts;stopwords;text] - names:inter[key[parser.i.q2spacy],`sentChars`sentIndices;opts],`isPunct; +// @private +// @kind function +// @category nlpParserUtility +// @desc This handles operations such as casting/removing punctuation +// that need to be done in q, or for performance reasons are better in q +// @param pyParser {fn} A projection to call the spaCy parser +// @param options {symbol[]} The fields to include in the output +// @param stopWords {symbol[]} The stopWords in the text +// @param text {string} The text being parsed +// @returns {dictionary} The parsed document +parser.i.unpack:{[pyParser;options;stopWords;text] + names:inter[key[parser.i.q2spacy],`sentChars`sentIndices;options],`isPunct; doc:names!pyParser text; + // Cast any attributes which should be symbols doc:@[doc;names inter`tokens`lemmas`uniPOS`pennPOS;`$]; + // If there are entities, cast them to symbols if[`entities in names;doc:.[doc;(`entities;::;0 1);`$]] if[`isStop in names; - if[`uniPOS in names;doc[`isStop]|:doc[`uniPOS ]in i.stopUniPOS ]; + if[`uniPOS in names;doc[`isStop]|:doc[`uniPOS]in i.stopUniPOS]; if[`pennPOS in names;doc[`isStop]|:doc[`pennPOS]in i.stopPennPOS]; - if[`lemmas in names;doc[`isStop]|:doc[`lemmas ]in stopwords]; - ]; + if[`lemmas in names;doc[`isStop]|:doc[`lemmas]in stopWords]; + ]; doc:parser.i.removePunct parser.i.adjustIndices[text]doc; - if[`sentIndices in opts; + if[`sentIndices in options; doc[`sentIndices]@:unique:value last each group doc`sentIndices; - if[`sentChars in opts;doc[`sentChars]@:unique] - ]; - @[doc;`;:;::]} + if[`sentChars in options;doc[`sentChars]@:unique] + ]; + @[doc;`;:;::] + } -// Python indexes into strings by char instead of byte, so must be modified to index a q string +// @private +// @kind function +// @category nlpParserUtility +// @desc This converts python 
indices to q indices in the text +// This has to be done because python indexes into strings by char instead +// of byte, so the indices must be adjusted before indexing into a q string +// @param text {string} The text being parsed +// @param doc {dictionary} The parsed document +// @returns {dictionary} The document with corrected indices parser.i.adjustIndices:{[text;doc] - adj:cont-til count cont:where ($[1~count text;enlist;]text) within"\200\277"; - if[`starts in cols doc;doc[`starts ]+:adj binr 1+doc`starts ]; - if[`sentChars in cols doc;doc[`sentChars]+:adj binr 1+doc`sentChars]; - doc} + if[1~count text;text:enlist text]; + // Any bytes following the first byte in UTF-8 multi-byte characters + // will be in the range 128-191. These are continuation bytes. + continuations: where text within "\200\277"; + // To find a character's index in python, + // the number of previous continuation bytes must be subtracted + adjusted:continuations-til count continuations; + // Add to each index the number of continuation bytes which came before it + // This needs to add 1, as the string "“hello”" gives the + // adjusted continuations 1 1 7 7. + // If the python index is 1, 1 1 7 7 binr 1 gives back 0, so it needs to + // check the index after the python index + if[`starts in cols doc;doc[`starts]+:adjusted binr 1+doc`starts]; + if[`sentChars in cols doc;doc[`sentChars]+:adjusted binr 1+doc`sentChars]; + doc + } -// Removes punctuation and space tokens and updates indices +// @private +// @kind function +// @category nlpParserUtility +// @desc Removes punctuation and space tokens and updates indices +// @param doc {dictionary} The parsed document +// @returns {dictionary} The parsed document with punctuation removed parser.i.removePunct:{[doc] - doc:@[doc;key[parser.i.q2spacy]inter k:cols doc;@[;where not doc`isPunct]]; + // Extract document attributes + attrs:cols doc; + doc:@[doc;key[parser.i.q2spacy]inter attrs;@[;where not doc`isPunct]]; idx:sums 0,not doc`isPunct; - if[`sentIndices in k;doc:@[doc;`sentIndices;idx]]; - doc _`isPunct} + if[`sentIndices in attrs;doc:@[doc;`sentIndices;idx]]; + doc _`isPunct + } + +// @private +// @kind function +// @category nlpParserUtility +// @desc Parse a URL into its constituent components +// @param url {string} The URL to be decomposed into its components +// @returns {string[]} The components which make up the URL +parser.i.parseURLs:{[url] + pyLambda:"lambda url: urlparse(url if seReg.match(url) ", + "else 'http://' + url)"; + .p.eval[pyLambda;<]url + }
diff --git a/code/regex.q b/code/regex.q index fd287c3..e8469d2 100644 --- a/code/regex.q +++ b/code/regex.q @@ -1,32 +1,223 @@ +// code/regex.q - Nlp regex utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for regular expressions + \d .nlp -re:.p.import`re -regex.compile:{re[`:compile;x;$[y;re`:IGNORECASE;0]]} -regex.matchAll:.p.eval["lambda p,t:[[x.group(),x.start(),x.end()]for x in p.finditer(t)]";<] -regex.check:{i.bool[x[`:search]y]`} - -regex.patterns.specialChars: "[-[\\]{}()*+?.,\\\\^$|#\\s]" -regex.patterns.money: "[$¥€£¤฿]?\\s*((?]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))" -regex.patterns.zipCode: "\\b\\d{5}\\b" -regex.patterns.postalCode: "\\b[a-z]\\d[a-z] ?\\d[a-z]\\d\\b" -regex.patterns.postalOrZipCode: "\\b(\\d{5}|[a-z]\\d[a-z] ?\\d[a-z]\\d)\\b" -regex.patterns.dtsep: "[\\b(of |in )\\b\\t .,-/\\\\]+" -regex.patterns.day: "\\b[0-3]?[0-9](st|nd|rd|th)?\\b" -regex.patterns.month: "\\b([01]?[0-9]|jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)\\b" -regex.patterns.year: "\\b([12][0-9])?[0-9]{2}\\b" -regex.patterns.yearfull: "\\b[12][0-9]{3}\\b" -regex.patterns.am: "(a[.\\s]?m\\.?)" -regex.patterns.pm: "(p[.\\s]?m\\.?)" -regex.patterns.time12: "\\b[012]?[0-9]:[0-5][0-9](h|(:[0-5][0-9])([.:][0-9]{1,9})?)?\\s*(",sv["|";regex.patterns`am`pm],")?\\b" -regex.patterns.time24: "\\b[012][0-9][0-5][0-9]h\\b" -regex.patterns.time: "(",sv["|";regex.patterns`time12`time24],")" -regex.patterns.yearmonthList: "(",sv["|";regex.patterns`year`month ],")" -regex.patterns.yearmonthdayList:"(",sv["|";regex.patterns`year`month`day],")" -regex.patterns.yearmonth: "(",sv[regex.patterns.dtsep;2#enlist regex.patterns.yearmonthList ],")" -regex.patterns.yearmonthday: "(",sv[regex.patterns.dtsep;3#enlist regex.patterns.yearmonthdayList],")" - -regex.objects:regex.compile[;1b]each 1_regex.patterns +// @private +// @kind function +// @category nlpRegexUtility +// @desc Import the regex module from python +regex.i.re:.p.import`re + +// @private +// @kind function +// @category nlpRegexUtility +// @desc Check if a pattern occurs in the text +// @param patterns {<} A regex pattern as an embedPy object +// @param text {string} A piece of text +// @returns {boolean} Indicates whether the pattern is present in the +// text +regex.i.check:{[patterns;text] + i.bool[patterns[`:search]text]` + } + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of special characters +// @type string +regex.i.patterns.specialChars:"[-[\\]{}()*+?.,\\\\^$|#\\s]" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of money characters +// @type string +regex.i.patterns.money:"[$¥€£¤฿]?\\s*((?]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of zipcode characters +// @type string +regex.i.patterns.zipCode:"\\b\\d{5}\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of postal code characters +// @type string +regex.i.patterns.postalCode:"\\b[a-z]\\d[a-z] ?\\d[a-z]\\d\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of postal or zip code characters +// @type string +regex.i.patterns.postalOrZipCode:"\\b(\\d{5}|[a-z]\\d[a-z] ?\\d[a-z]\\d)\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of date separator characters +// @type string +regex.i.patterns.dateSeparate:"[\\b(of |in )\\b\\t .,-/\\\\]+" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of day characters +// @type string +regex.i.patterns.day:"\\b[0-3]?[0-9](st|nd|rd|th)?\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of month characters +// @type string +regex.i.patterns.month:"\\b([01]?[0-9]|jan(uary)?|feb(ruary)?|mar(ch)?|", + "apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?", + "|dec(ember)?)\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year characters +// @type string +regex.i.patterns.year:"\\b([12][0-9])?[0-9]{2}\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year characters in full +// @type string +regex.i.patterns.yearFull:"\\b[12][0-9]{3}\\b"
+ +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of am characters +// @type string +regex.i.patterns.am:"(a[.\\s]?m\\.?)" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of pm characters +// @type string +regex.i.patterns.pm:"(p[.\\s]?m\\.?)" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of time (12hr) characters +// @type string +regex.i.patterns.time12:"\\b[012]?[0-9]:[0-5][0-9](h|(:[0-5][0-9])([.:][0-9]", + "{1,9})?)?\\s*(",sv["|";regex.i.patterns`am`pm],")?\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of time (24hr) characters +// @type string +regex.i.patterns.time24:"\\b[012][0-9][0-5][0-9]h\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of all time characters +// @type string +regex.i.patterns.time:"(",sv["|";regex.i.patterns`time12`time24],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month characters as a list +// @type string +regex.i.patterns.yearMonthList:"(",sv["|";regex.i.patterns`year`month],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month/day characters as a list +// @type string +regex.i.patterns.yearMonthDayList:"(",sv["|"; + regex.i.patterns`year`month`day],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month characters along with date separators +// @type string +regex.i.patterns.yearMonth:"(",sv[regex.i.patterns.dateSeparate; + 2#enlist regex.i.patterns.yearMonthList],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month/day characters along with date +// separators +// @type string +regex.i.patterns.yearMonthDay:"(",sv[regex.i.patterns.dateSeparate; + 3#enlist regex.i.patterns.yearMonthDayList],")" + +// @kind function +// @category nlpRegex +// @desc Compile a regular expression pattern into a regular +// expression embedPy object which can be used for matching +// @param patterns {string} A regex pattern +// @param ignoreCase {boolean} Whether the case of the string is to be ignored +// @returns {<} The compiled regex object +regex.compile:{[patterns;ignoreCase] + case:$[ignoreCase;regex.i.re`:IGNORECASE;0]; + regex.i.re[`:compile;patterns;case] + } + +// @kind function +// @category nlpRegex +// @desc Finds all the matches in a string of text +// @param patterns {<} A regex pattern as an embedPy object +// @param text {string} A piece of text +// @returns {::|string[]} If the pattern is not present in the text a null +// is returned. 
Otherwise, the pattern along with the index where the +// pattern begins and ends is returned +regex.matchAll:.p.eval["lambda p,t:[[x.group(),x.start(),x.end()]", + "for x in p.finditer(t)]";<] +// @kind function +// @category nlpRegex +// @desc Compile all patterns into regular expression objects +// @return {<} The compiled regex object +regex.objects:regex.compile[;1b]each 1_regex.i.patterns diff --git a/code/sent.q b/code/sent.q index 8e977a1..fc92ce8 100644 --- a/code/sent.q +++ b/code/sent.q @@ -1,98 +1,210 @@ +// code/email.q - Nlp sentiment utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for sentiment analysis + \d .nlp -// Create regex used for tokenizing +// @private +// @kind function +// @category nlpSentUtility +// @desc Create a regex patterns used for tokenization +// @returns {<} The compiled regex object sent.i.tokenPattern:{ - rightFacingEmoticons:"[<>]?[:;=8][\\-o\\*\\']?[\\)\\]\\(\\[dDpP/\\:\\}\\{@\\|\\\\]"; / n.b. Left-facing rarely used - miscEmoticons:"<3|[0o][._][0o]|]?[:;=8][\\-o\\*\\']?[\\)\\]\\(\\[dDpP/\\:\\}\\{@", + "\\|\\\\]"; / n.b. Left-facing rarely used + miscEmoticons:"<3|[0o][._][0o]|0; + positive:sum 1+valences where valences>0; negative:sum -1+valences where valences<0; neutral:count where valences=0; // If punctuation affects the sentiment, apply emphasis to dominant sentiment @@ -119,16 +239,5 @@ sent.i.scoreValence:{[valences;text] if[positivetokens; - upperIndices:where isUpperCase & not all isUpperCase; - valences[upperIndices]+:sent.i.ALLCAPS_INCR*signum valences upperIndices; - valences:sent.i.applyBoosters[tokens;isUpperCase;valences]; - valences:sent.i.negationCheck[tokens;valences]; - valences:sent.i.butCheck[tokens;valences]; - sent.i.scoreValence[0f^valences;text]} - + `compound`pos`neg`neu!(compound,abs(positive;negative;neutral)%total) + } diff --git a/code/utils.q b/code/utils.q index 8f5f860..3c5412f 100644 --- a/code/utils.q +++ b/code/utils.q @@ -1,66 +1,201 @@ +// code/utils.q - NLP utilities +// Copyright (c) 2021 Kx Systems Inc +// +// General nlp utility functions + \d .nlp \l p.q -{.p.import[`sys;x][:;`:write;{x y;count y}y]}'[`:stdout`:stderr;1 2]; / redundant in latest embedPy + +// @private +// @kind function +// @category nlpUtility +// @desc Import python functions i.np:.p.import`numpy i.str:.p.import[`builtins]`:str i.bool:.p.import[`builtins]`:bool -// Fast sum list of dicts (3 experimentally determined optimal number iterations) -i.fastSum:{[it;d]sum$[it;.z.s it-1;sum]each(ceiling sqrt count d)cut d}2 - -// Replace empty dicts with (,`)!,0f -i.fillEmptyDocs:{[docs]$[98=type docs;0^docs;@[docs;i;:;count[i:where not count each docs]#enlist(1#`)!1#0f]]} - -// Given monotonic increasing int list, return runs of consecutive numbers -i.findRuns:{(where x<>1+prev x)_ x@:where r|:next r:x=1+prev x} - -// Get all sentences for doc -i.getSentences:{[doc](sublist[;doc`text]deltas@)each doc`sentChars} - -// Index of min element -i.minIndex:{x?min x} - -// Index of max element -i.maxIndex:{x?max x} - -// Calc harmonic mean -i.harmonicMean:{1%avg 1%x} - -// Calc a vector's magnitude -i.magnitude:{sqrt sum x*x} - -// Normalize list or dict so the highest value is 1f -i.normalize:{x%max x} - -// Take largest N values -i.takeTop:{[n;x]n sublist desc x} - -// Jaro distance of 2 strings -i.jaro:{[s1;s2] - if[0=l1:count s1;:0f]; - d:1|-1+floor .5*l1|l2:count s2; - k:l[0]+where each s1='sublist\:[flip l:deltas 0|til[l1]+/:(-1 1)*d]s2; - m:count i:$[1=count j:k[0;0]{x,(y except x)0}/1_k;where not null j:enlist[j];where not 
null j]; - t:.5*sum s1[i]<>s2 asc j i; - avg(m%l1;m%l2;(m-t)%m)} - -// Jaro-Winkler distance of 2 strings -i.jaroWinkler:{$[0.71+prev array)_ array@:inRun + } + +// @private +// @kind function +// @category nlpUtility +// @desc Index of the first occurrence of the minimum +// value of an array +// @param array {number[]} Array of values +// @return {number} The index of the minimum element of the array +i.minIndex:{[array] + array?min array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Index of the first occurrence of the maximum +// value of the array +// @param array {number[]} Array of values +// @return {number} The index of the maximum element of the array +i.maxIndex:{[array] + array?max array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate the harmonic mean +// @param array {number[]} Array of values +// @returns {float} The harmonic mean of the input +i.harmonicMean:{[array] + 1%avg 1%array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate a vector's magnitude +// @param array {number[]} Array of values +// @returns {float} The magnitude of the vector +i.magnitude:{[array] + sqrt sum array*array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Normalize a list or dictionary so the highest value is 1f +// @param vals {float[]|dictionary} A list or dictionary of numbers +// @returns {float[]|dictionary} The input, normalized +i.normalize:{[vals] + vals%max vals + } + +// @private +// @kind function +// @category nlpUtility +// @desc Takes the largest N values +// @param n {long} The number of elements to take +// @param vals {any[]} A list of values +// @returns {any[]} The largest N values +i.takeTop:{[n;vals] + n sublist desc vals + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate the Jaro similarity score of two strings +// @param str1 {string|string[]} A string of text +// @param str2 {string|string[]} A string of text +// @returns {Float} The similarity score of two strings +i.jaro:{[str1;str2] + lenStr1:count str1; + lenStr2:count str2; + if[0=lenStr1;:0f]; + // The range to search for matching characters + range:1|-1+floor .5*lenStr1|lenStr2; + // The low end of each window + lowWin:deltas 0|til[lenStr1]+/:(-1 1)*range; + k:lowWin[0]+where each str1='sublist\:[flip lowWin]str2; + j:raze k[0;0]{x,(y except x)0}/1_k; + nonNull:where not null j; + n:count nonNull; + // Find the number of transpositions + trans:.5*sum str1[nonNull]<>str2 asc j nonNull; + avg(n%lenStr1;n%lenStr2;(n-trans)%n) + } + +// @private +// @kind function +// @category nlpUtility +// @desc Generating symmetric matrix from triangle (ragged list) +// This is used to save time when generating a matrix where the upper +// triangular component is the mirror of the lower triangular component +// @param raggedList {float[][]} A list of lists of floats representing +// an upper triangular matrix where the diagonal values are all 0. +// eg. 
(2 3 4f; 5 6f; 7f) for a 4x4 matrix +// @returns {float[][]} An n x n two dimensional array +// The input, mirrored across the diagonal, with all diagonal values being 1 +i.matrixFromRaggedList:{[raggedList] + // Pad the list with 0fs to make it an array, and set the diagonal values to + // .5 which become 1 when the matrix is added to its flipped value + matrix:((til count raggedList)#'0.),'.5,'raggedList; + matrix+flip matrix + } + +// @private +// @kind data +// @category nlpUtility +// @desc Parts-of-speech not useful as keywords +// @type symbol[] i.stopUniPOS:asc`ADP`PART`AUX`CONJ`DET`SYM`NUM`PRON`SCONJ -i.stopPennPOS:asc`CC`CD`DT`EX`IN`LS`MD`PDT`POS`PRP`SYM`TO`WDT`WP`WRB`,`$("PRP$";"WP$";"$") /add in ` for symbols - -// Parse urls -p)from urllib.parse import urlparse -p)import re -p)seReg=re.compile('([a-z0-9]+:)?//') -i.parseURLs:.p.eval["lambda url: urlparse(url if seReg.match(url) else 'http://' + url)";<] - -// Calc cosine similarity between doc and entire corpus -i.compareDocToCorpus:{[keywords;idx]compareDocs[keywords idx]each(idx+1)_ keywords} - - +i.stopPennPOS:asc`CC`CD`DT`EX`IN`LS`MD`PDT`POS`PRP`SYM`TO`WDT`WP`WRB`, + `$("PRP$";"WP$";"$") + +// @private +// @kind function +// @category nlpUtility +// @desc Get the count of individual terms in a corpus +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} The count of terms in the corpus +i.getTermCount:{[parsedTab] + tokens:parsedTab[`tokens]@'where each not parsedTab`isStop; + i.fastSum{1+log count each group x}each tokens + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate the probability of words appearing in a text +// @param tokens {symbol[]} The tokens in the text +// @param occurance {dictionary} The total times a token appears in the text +// @param token {symbol} A single token +// @param nextToken {symbol} The next token in the list of tokens +// @returns {dictionary} The probability that the secondary word in the +// sequence follows the primary word.
+i.biGram:{[tokens;occurance;token;nextToken] + returnKeys:enlist(token;nextToken); + countToken:count where nextToken=tokens 1+where token=tokens; + returnVals:countToken%occurance[token]; + returnKeys!enlist returnVals + } diff --git a/init.q b/init.q index 33eb05c..f605221 100644 --- a/init.q +++ b/init.q @@ -1,3 +1,6 @@ +// init.q - Load nlp libraries +// Copyright (c) 2021 Kx Systems Inc + path:{string`nlp^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` system"l ",path,"/","nlp.q" @@ -6,9 +9,11 @@ system"l ",path,"/","nlp.q" loadfile`:code/utils.q loadfile`:code/regex.q loadfile`:code/sent.q +loadfile`:code/parser.p loadfile`:code/parser.q -loadfile`:code/date_time.q +loadfile`:code/dateTime.q +loadfile`:code/extractRtf.p loadfile`:code/email.q loadfile`:code/cluster.q -loadfile`:code/nlp_code.q +loadfile`:code/nlpCode.q diff --git a/nlp.q b/nlp.q index aba5942..2560ff7 100644 --- a/nlp.q +++ b/nlp.q @@ -1,3 +1,8 @@ +// nlp.q - Setup for nlp namespace +// Copyright (c) 2021 Kx Systems Inc +// +// Define version, path, and loadfile + \d .nlp version:@[{NLPVERSION};0;`development] path:{string`nlp^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` diff --git a/tests/filelength.t b/tests/filelength.t new file mode 100644 index 0000000..1b0d821 --- /dev/null +++ b/tests/filelength.t @@ -0,0 +1,36 @@ +// Names of the folders containing q scripts whose line lengths are to be tested +folders:enlist"code" + +// Function for retrieval of all q files +getFiles:{ + files:key hsym `$x; + pathStem:x,"/"; + qfiles:files where files like "*.q"; + `$pathStem,/:string qfiles + } + +// List of all the q files within the appropriate folders +files:raze getFiles each folders + +// For an individual file test that the lines of the file do no exceed 80 characters unless +// exempt using '// noqa' at the end of the line +testLineLength:{[file] + fileContent:read0 hsym file; + excessCharacters:80 keywords[0; `transacting] enlist[(`u#`$())!()]~TFIDF([]tokens:enlist `$(); isStop:enlist `boolean$()); keywords:TFIDF enlist corpus 1; 98h~type keywords -keywords_tot:TFIDF_tot corpus -keywords_tot[`erv]~keywords_tot[`published] -keywords_tot[`mpr] > keywords_tot[`attached] p:newParser[`en;`keywords]; corpus:p text; 1f~compareDocs . corpus[`keywords]0 0 @@ -96,10 +93,10 @@ getSentences[first sentenceParser enlist"This is my sentence"]~enlist "This is m (getSentences first sentenceParser enlist "There's no train to Guysborough. 
Though I know there'll be one in time")~("There's no train to Guysborough."; "Though I know there'll be one in time") truncate:{[precision; x]coefficient: 10 xexp precision;reciprocal[coefficient]*`long$coefficient*x} /jaroWinkler -all(.961~truncate[3] i.jaroWinkler["martha";"marhta"];.840~truncate[3] i.jaroWinkler["dwayne"; "duane"];.813~truncate[3] i.jaroWinkler["dixon";"dicksonx"];.743~truncate[3] i.jaroWinkler["johnson"; "jannsen"];.562~truncate[3] i.jaroWinkler["johnson";"jannsenberg"];.906~truncate[3] i.jaroWinkler["aahahahahahahhaahah"; "ahahahahhahahahahaha"]) -all(0f~i.jaroWinkler["benjamin";enlist"z"];0f~i.jaroWinkler["benjamin";enlist"a"]) -all(0f~i.jaroWinkler["";enlist"a"];0f~i.jaroWinkler["ben";""]) -.75~i.jaroWinkler["abcd"; enlist "b"] +all(.961~truncate[3] jaroWinkler["martha";"marhta"];.840~truncate[3] jaroWinkler["dwayne"; "duane"];.813~truncate[3] jaroWinkler["dixon";"dicksonx"];.743~truncate[3] jaroWinkler["johnson"; "jannsen"];.562~truncate[3] jaroWinkler["johnson";"jannsenberg"];.906~truncate[3] jaroWinkler["aahahahahahahhaahah"; "ahahahahhahahahahaha"]) +all(0f~jaroWinkler["benjamin";enlist"z"];0f~jaroWinkler["benjamin";enlist"a"]) +all(0f~jaroWinkler["";enlist"a"];0f~jaroWinkler["ben";""]) +.75~jaroWinkler["abcd"; enlist "b"] p:newParser[`en; `tokens`isStop]; corpus:p text; (()!())~keywordsContinuous 0#corpus @@ -109,9 +106,9 @@ keywords:keywordsContinuous enlist doc; 99h ~ type keywords keywords:keywordsContinuous corpus; {x~desc x} keywords `chairman`chief`group`enron`thanks`mountains -(1 1f,(2%3),(1%3),0.5 0.5 0.5 0.5 0.5 0.5)~value 10#ngram[enlist first corpus;2] -1 1 .5 .5 1 1 1 1 1 1f~value 10#ngram[enlist first corpus;3] -((`enrononline`management`report);(`management`report`june);(`report`june`attached))~key 3#ngram[enlist first corpus;3] +(1 1f,(2%3),(1%3),0.5 0.5 0.5 0.5 0.5 0.5)~value 10#nGram[enlist first corpus;2] +1 1 .5 .5 1 1 1 1 1 1f~value 10#nGram[enlist first corpus;3] +((`enrononline`management`report);(`management`report`june);(`report`june`attached))~key 3#nGram[enlist first corpus;3] emails:email.loadEmails["tests/data/test.mbox"] `sender`to`date`subject`contentType`payload`text~cols emails (last emails`text)~"Your email client does not support HTML mails." @@ -124,16 +121,16 @@ parseURLs["https://www.google.ca:1234/test/index.html;myParam?foo=bar&quux=blort all(parseURLs["google.ca/test/index.html"][`scheme`domainName`path]~("http";"google.ca";"/test/index.html");parseURLs["www.google.co.uk"][`scheme`domainName`path]~("http";"www.google.co.uk";"")) parseURLs["https://网站.中国.com"]~`scheme`domainName`path`parameters`query`fragment!("https";"网站.中国.com";"";"";"";"") (parseURLs each ("https://travel.gc.ca/";"https://www.canada.ca/en/revenue-agency.html"))~([]scheme:("https"; "https");domainName:("travel.gc.ca"; "www.canada.ca");path:(enlist "/";"/en/revenue-agency.html");parameters: (""; "");query:(""; "");fragment:(""; "")) -seq:bi_gram[corpus] +seq:biGram[corpus] seq[`enrononline`management]~1f seq[`management`report]>seq[`report`june] `en~detectLang["This is a sentence"] `de~detectLang["Das ist ein Satz"] `fr~detectLang["C'est une phrase"] -ascii["This is ä senteñcê"]~"This is sentec" +removeNonAscii["This is ä senteñcê"]~"This is sentec" rmv_list :("http*";"*,";"*&*";"*[0-9]*") -rmv_custom["https//:google.com & https//:bing.com are 2 search engines!";rmv_list]~"are search engines!" 
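// For context on the biGram probabilities tested above, a hypothetical
// mini-example of the underlying i.biGram calculation (token names are
// illustrative only, not taken from the Enron test corpus):
// q)tokens:`the`cat`the`dog
// q)occurance:count each group tokens          / `the`cat`dog!2 1 1
// q).nlp.i.biGram[tokens;occurance;`the;`cat]  / (enlist`the`cat)!enlist 0.5
// i.e. "cat" follows one of the two occurrences of "the", giving 0.5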
-rmv_main["https//:google.com & https//:bing.com are 2 search engines!";",.:?!/@'\n";""]~"httpsgooglecom & httpsbingcom are 2 search engines" +removeCustom["https//:google.com & https//:bing.com are 2 search engines!";rmv_list]~"are search engines!" +removeReplace["https//:google.com & https//:bing.com are 2 search engines!";",.:?!/@'\n";""]~"httpsgooglecom & httpsbingcom are 2 search engines" loadDir:loadTextFromDir["tests/data/test.mbox"] `fileName`path`text~cols loadDir loadDir[`fileName]~enlist `test.mbox @@ -144,5 +141,5 @@ phonecall:corpus n:where corpus[`text] like "*Telephone Call*" remaining:corpus til[count corpus]except n (`message`murdock`erica`error`jerry;`enron`know`let,`meeting`company)~key each 5#/:compareCorpora[phonecall;remaining] txt:"You can call the number 123 456 7890 or email us on name@email.com in book an appoinment for January,February and March for £30.00" -findRegex[txt;`phoneNumber`emailAddress`yearmonthList`money]~`phoneNumber`emailAddress`yearmonthList`money!(enlist (" 123 456 7890";23;36);enlist("name@email.com";52;66);(("January";93;100);("February";101;109);("March";114;119);("30";125;127);("00";128;130));enlist("\302\24330.00";124;130)) +findRegex[txt;`phoneNumber`emailAddress`yearMonthList`money]~`phoneNumber`emailAddress`yearMonthList`money!(enlist (" 123 456 7890";23;36);enlist("name@email.com";52;66);(("January";93;100);("February";101;109);("March";114;119);("30";125;127);("00";128;130));enlist("\302\24330.00";124;130)) \d . diff --git a/tests/senttest.t b/tests/senttest.t index bb0d84d..bc9ffa2 100644 --- a/tests/senttest.t +++ b/tests/senttest.t @@ -7,19 +7,11 @@ sent.i.amplifyEP[enlist "!"]~.292 sent.i.amplifyQM[""]~0f sent.i.amplifyQM[enlist "?"]~0f 0 0 0.36 0.54 0.96 0.96~sent.i.amplifyQM each ("yes"; "oh?"; "oh? 
really?"; "you don't say???"; "forsooth????"; "????????????") -all (sent.i.findSequence[`a`b`c`d;enlist`c]~enlist 2;sent.i.findSequence[`c`b`c`d; enlist `c] ~ 0 2) -all (sent.i.findSequence[`a`b`c`d`e`f;`c`d]~enlist 2;sent.i.findSequence[`a`b`c`d`e`f`c`d; `c`d] ~ 2 6;sent.i.findSequence[`a`b`c`d`e`f`a`b`c`d`e`g`a`b`c`d; `a`b`c`d] ~ 0 6 12) -sent.i.findSequence[`a`b`c`d;`c]~enlist 2; -sent.i.findSequence[`$();`a`b`c]~`long$() -all(sent.i.findSequence[enlist`a;`a]~enlist 0;sent.i.findSequence[enlist`a;`b]~`long$()) -sent.i.findSequence[`a`b`c`d`e`a;`a]~0 5 -sent.i.findSequence[0 0 4 5 1 2 4 5;4 5]~2 6 -sent.i.findSequence["Facebook,Tim Cook";"oo"]~5 14 sent.i.butCheck[`$(); `float$()] ~ `float$() all(sent.i.butCheck[enlist `good; enlist 2f] ~ enlist 2f;sent.i.butCheck[enlist`but;enlist 0f]~enlist 0f) all(sent.i.butCheck[`that`was`good`but; 0 0 1 0f] ~ 0 0 .5 0f;sent.i.butCheck[`that`was`good`but`it; 0 0 1 0 0f] ~ 0 0 .5 0 0f;sent.i.butCheck[`but`it`was`ok; 0 0 0 1f] ~ 0 0 0 1.5f;sent.i.butCheck[`tasty`but`it`smelled`bad; 2 0 0 -1.5 -2f] ~ 1 0 0 -2.25 -3f) sent.i.butCheck[`it`was`good`and`useful`but`boring`and`gross;0 0 1 0 1.5 0 -1 0 -2]~0 0 .5 0 .75 0 -1.5 0 -3 -compare:{value (floor 1000* sent.score x) % 1000} +compare:{value (floor 1000* sentiment x) % 1000} all(compare[""]~0 0 0 0f;compare["\t\t\r\n\n"]~0 0 0 0f;compare["a b c 1"]~0 0 0 0f) all(compare["bad"]~-.543 0 1 0f;compare["racist"]~-.613 0 1 0f;compare["good"]~.44 1 0 0f;compare["free"] ~.51 1 0 0f;compare["those"]~0 0 0 1f;compare["123"]~0 0 0 1f) all(compare["ugly smile"]~-0.203 0.431 0.568 0;compare["free sadness"]~0.102 0.532 0.467 0) @@ -33,8 +25,8 @@ all(compare["Paul Anka is cool"]~0.318 0.433 0 0.566;compare["Paul Anka is cool, all(compare["Jethro Tull is dorkier"]~-0.274 0 0.411 0.588;compare["But Jethro Tull is dorkier"]~-0.392 0 0.398 0.601) all(compare["Paul Anka is a dork"]~-0.34 0 0.444 0.555;compare["Paul Anka isn't a dork"]~.258 .404 0 0.595) all(compare["Paul Anka is a nerd"]~-0.296 0 0.423 0.576;compare["Paul Anka is kind of a nerd"]~-0.229 0 0.322 0.677) -all(sent.score["Paul Anka is the GREATEST"][`compound`pos])>sent.score["Paul Anka is thegreatest"][`compound`pos] -(sent.score["PAUL ANKA IS THE GREATEST"])~sent.score["Paul Anka is the greatest"] +all(sentiment["Paul Anka is the GREATEST"][`compound`pos])>sentiment["Paul Anka is thegreatest"][`compound`pos] +(sentiment["PAUL ANKA IS THE GREATEST"])~sentiment["Paul Anka is the greatest"] all(compare["中国 is beautiful"]~0.599 0.661 0 0.338;compare["Best φαλάφελ in Greece"]~0.636 0.583 0 0.416;compare["Paul Anka…king of the dorks"]~-0.129 0 0.23 0.769) compare["Paul Anka's singing is beautiful- especially Black Hole Sun"]~compare["Paul Anka's singing is beautiful especially Black Hole Sun"] \d .
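// A few quick sanity checks of the public functions exercised in the tests
// above (a sketch only: it assumes the library has been loaded via init.q and
// that these names resolve in the .nlp namespace; the approximate values
// follow the test expectations above):
// q).nlp.sentiment["good"]                / compound ~ 0.44, pos 1, neg 0, neu 0
// q).nlp.jaroWinkler["martha";"marhta"]   / ~ 0.961 to three decimal places
// q).nlp.parseURLs["www.google.co.uk"]`scheme`domainName`path  / ("http";"www.google.co.uk";"")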