
Commit

Merge branch 'Dianeod-dev'
awilson-kx committed Jan 16, 2020
2 parents 82bed6b + df88c9d commit 5f7ac52
Showing 26 changed files with 440 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -27,7 +27,7 @@ install:
- if [[ "x$QLIC_KC" != "x" ]]; then
echo -n $QLIC_KC |base64 --decode > q/kc.lic;
pip -q install -r requirements.txt;
- python -m spacy download en;
+ python -m spacy download en;
fi
before_script:
- IMPLEMENTATION=$(if [[ "x$TRAVIS_TAG" == "x" ]]; then echo $TRAVIS_BRANCH-$TRAVIS_COMMIT; else echo $TRAVIS_TAG; fi;)
94 changes: 68 additions & 26 deletions README.md
@@ -17,7 +17,9 @@ The NLP library allows users to parse datasets using the spacy model from Python
The following python packages are required:
1. numpy
2. beautifulsoup4
-3. spacy
+3. spacy

* Tests were run using spacy version 2.2.1

Install these packages with

@@ -27,30 +29,72 @@ pip install -r requirements.txt
```
or with conda
```bash
-conda install --file requirements.txt
+conda install -c conda-forge --file requirements.txt
```

* Download the English model using ```python -m spacy download en```


Other languages that spacy supports can be found at https://spacy.io/usage/models#languages
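
Once a model has been downloaded it can be passed to the library's parser constructor. Below is a minimal sketch, assuming the `.nlp.newParser` signature documented on the nlp homepage (spacy model name, followed by the fields to extract):

```q
/ sketch: assumes .nlp.newParser[model;fields] as documented at code.kx.com/v2/ml/nlp
myparser:.nlp.newParser[`en;`tokens`isStop]   / English model; extract tokens and stop-word flags
parsed:myparser enlist"The quick brown fox jumps over the lazy dog"
```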

To use languages that are in the alpha stage of development in spacy, the following steps can be taken:

To download the Chinese model, jieba must first be installed:
```bash
pip install jieba
```

To download the Japanese model, mecab must first be installed:
```bash
pip install mecab-python3
```

* spacy_hunspell is not a requirement for running these scripts, but can be installed using the following methods:

Linux
```bash
sudo apt-get install libhunspell-dev hunspell
pip install spacy_hunspell
```

macOS
```bash
wget https://iweb.dl.sourceforge.net/project/wordlist/speller/2019.10.06/hunspell-en_US-2019.10.06.zip;
unzip hunspell-en_US-2019.10.06; sudo mv en_US.dic en_US.aff /Library/Spelling/;
brew install hunspell;
export C_INCLUDE_PATH=/usr/local/include/hunspell;
sudo ln -sf /usr/local/lib/libhunspell-1.7.a /usr/local/lib/libhunspell.a;
sudo ln -sf /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell-1.7.dylib /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell.dylib;
CFLAGS=$(pkg-config --cflags hunspell) LDFLAGS=$(pkg-config --libs hunspell) pip install hunspell==0.5.0
```

At the moment spacy_hunspell does not support installation on Windows. More information can be found at https://github.com/tokestermw/spacy_hunspell

## Installation
Run tests with

```bash
q test.q
```

-Place the library file in `$QHOME` and load `nlp/init.q`
+Place the library file in `$QHOME` and load it into a q instance using

```q
-q)\l nlp/init.q
-Loading utils.q
-Loading regex.q
-Loading sent.q
-Loading parser.q
-Loading time.q
-Loading date.q
-Loading email.q
-Loading cluster.q
-Loading nlp.q
+q)\l nlp/nlp.q
+q).nlp.loadfile`:init.q
+Loading init.q
+Loading code/utils.q
+Loading code/regex.q
+Loading code/sent.q
+Loading code/parser.q
+Loading code/time.q
+Loading code/date.q
+Loading code/email.q
+Loading code/cluster.q
+Loading code/nlp_code.q
q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20"
09:00:00.000 "9:00am" 18 24
10:20:00.000 "10:20" 45 50
Expand All @@ -73,15 +117,15 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can
KDB+ 3.5 2018.04.25 Copyright (C) 1993-2018 Kx Systems
l64/ 4()core 7905MB kx 0123456789ab 172.17.0.2 EXPIRE 2018.12.04 [email protected] KOD #0000000

-Loading utils.q
-Loading regex.q
-Loading sent.q
-Loading parser.q
-Loading time.q
-Loading date.q
-Loading email.q
-Loading cluster.q
-Loading nlp.q
+Loading code/utils.q
+Loading code/regex.q
+Loading code/sent.q
+Loading code/parser.q
+Loading code/time.q
+Loading code/date.q
+Loading code/email.q
+Loading code/cluster.q
+Loading code/nlp_code.q
q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20"
09:00:00.000 "9:00am" 18 24
10:20:00.000 "10:20" 45 50
@@ -97,9 +141,7 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can

Documentation is available on the [nlp](https://code.kx.com/v2/ml/nlp/) homepage.

## Status

2 changes: 1 addition & 1 deletion build/getembedpy.q
@@ -1,5 +1,5 @@
qhome:hsym`$$[not count u:getenv`QHOME;[-2"QHOME not defined";exit 1];u];
-dl:{[s;url]$[s;;`/:]system"curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]}
+dl:{[s;url]$[s;;`/:]system "curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]}
download:{
assets:.j.k[dl[0b]"https://api.github.com/repos/KxSystems/embedPy/releases/",$[not[count x]|x~"latest";"latest";"tags/",x]]`assets;
relurl:first exec browser_download_url from assets where name like{"*",x,"*"}(`m64`l64`w64!string`osx`linux`windows).z.o;
12 changes: 6 additions & 6 deletions cluster.q → code/cluster.q
@@ -6,7 +6,7 @@ cluster.i.asKeywords:{i.fillEmptyDocs $[-9=type x[0]`keywords;x;x`keywords]}
// Get cohesiveness of cluster as measured by mean sum of squares error
cluster.MSE:{[docs]
$[0=n:count docs;0n;1=n;1.;0=sum count each docs;0n;
-avg d*d:0^i.compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]}
+avg d*d:0^compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]}

// Bisecting k-means algo (repeatedly splits largest cluster in 2)
cluster.bisectingKMeans:{[docs;k;n]
@@ -24,7 +24,7 @@ cluster.kmeans:{[docs;k;n]
}[docs]/(k;0N)#neg[nd]?nd:count docs:cluster.i.asKeywords docs}

// Match each doc to nearest centroid
-cluster.i.groupByCentroids:{[centroids;docs]
+cluster.groupByCentroids:{[centroids;docs]
value group{[centroids;doc]$[0<m:max s:compareDocs[doc]each centroids;s?m;0n]}[centroids]each docs}

// Merge any clusters with significant overlap into a single cluster
@@ -66,7 +66,7 @@ cluster.MCL:{[docs;mn;sample]
keywords:docs idx:$[sample;(neg"i"$sqrt count docs)?count docs;til count docs];
similarities:i.matrixFromRaggedList i.compareDocToCorpus[keywords]each til count keywords;
// Find all the clusters
-clustersOfOne:1=count each clusters:cluster.similarityMatrix similarities>=mn;
+clustersOfOne:1=count each clusters:cluster.i.similarityMatrix similarities>=mn;
if[not sample;:clusters where not clustersOfOne];
// Any cluster of one document isn't a cluster, so throw it out
outliers:raze clusters where clustersOfOne;
@@ -76,11 +76,11 @@ cluster.MCL:{[docs;mn;sample]
centroids:avg each keywords clusters;
// Move each non-outlier to the nearest centroid
nonOutliers:(til count docs)except idx outliers;
-nonOutliers cluster.i.groupByCentroids[centroids;docs nonOutliers]}
+nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers]}

// Graph clustering that works on a similarity matrix
cluster.i.columnNormalize:{[mat]0f^mat%\:sum mat}
-cluster.similarityMatrix:{[mat]
+cluster.i.similarityMatrix:{[mat]
matrix:"f"$mat;
// SM Van Dongen's MCL clustering algorithm
MCL:{[mat]
@@ -105,4 +105,4 @@ cluster.summarize:{[docs;n]
centroids,:nearest:i.maxIndex docs[;i.maxIndex summary];
summary-:docs nearest;
summary:(where summary<0)_ summary];
-cluster.i.groupByCentroids[docs centroids;docs]}
+cluster.groupByCentroids[docs centroids;docs]}
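
Not part of this commit, but as a usage sketch of the renamed clustering entry points: per the `cluster.kmeans:{[docs;k;n]...}` definition above, the function takes parsed documents, a cluster count and an iteration count. The parser construction here is an assumption based on the library's documented API:

```q
/ sketch only: assumes .nlp.newParser and the cluster.kmeans[docs;k;n] signature above
parser:.nlp.newParser[`en;`keywords]
docs:parser("foxes are quick and brown";"dogs are lazy";"a quick fox jumped")
clusters:.nlp.cluster.kmeans[docs;2;30]   / 2 clusters, 30 iterations
/ clusters is a list of row-index lists grouping similar documents
```
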
38 changes: 24 additions & 14 deletions date.q → code/date_time.q
@@ -1,38 +1,38 @@
\d .nlp

// Pad day string to 2 digits
-tm.parseDay:{-2#"0",x where x in .Q.n}
+tm.i.parseDay:{-2#"0",x where x in .Q.n}

// Convert month string and pad to 2 digits
-tm.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12
-tm.parseMonth:{-2#"0",string x^tm.months x:lower`$3 sublist x}
+tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12
+tm.i.parseMonth:{-2#"0",string x^tm.i.months x:lower`$3 sublist x}

// Pad year string to 4 digits (>35 deemed 1900s)
-tm.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x}
+tm.i.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x}

// Convert year string to date range
-tm.convY:{"D"$x,/:(".01.01";".12.31")}
+tm.i.convY:{"D"$x,/:(".01.01";".12.31")}

// Convert yearmonth string to date range
-tm.convYM:{
+tm.i.convYM:{
matches:ungroup([fmt:"ym"]txt:regex.matchAll[;x]each regex.objects`year`month);
matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt;
fmt:{@[x;where not xx;except[;raze x where xx:1=count each x]]}/[matches`fmt];
fmt:raze@[fmt;i where 1<count each i:group fmt;:;" "];
-0 -1+"d"$0 1+"M"$"."sv tm[`parseYear`parseMonth]@'matches[`txt]idesc fmt}
+0 -1+"d"$0 1+"M"$"."sv tm.i[`parseYear`parseMonth]@'matches[`txt]idesc fmt}

// Convert yearmonthday string to date range
-tm.convYMD:{
+tm.i.convYMD:{
matches:ungroup([fmt:"ymd"]txt:regex.matchAll[;x]each regex.objects`year`month`day);
matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt;
fmt:{@[x;i unq;:;"ymd" unq:where 1=count each i:where each "ymd" in/:\:x]}/[matches`fmt];
-fmt:tm.resolveFormat raze@[fmt;where 1<count each fmt;:;" "];
-2#"D"$"."sv tm[`parseYear`parseMonth`parseDay]@'matches[`txt]idesc fmt}
+fmt:tm.i.resolveFormat raze@[fmt;where 1<count each fmt;:;" "];
+2#"D"$"."sv tm.i[`parseYear`parseMonth`parseDay]@'matches[`txt]idesc fmt}

// Fill in blanks in date format string
-tm.resolveFormat:{$[0=n:sum" "=x;;1=n;ssr[;" ";first"ymd"except x];2=n;tm.dateFormats;{"dmy"}]x}
+tm.i.resolveFormat:{$[0=n:sum" "=x;;1=n;ssr[;" ";first"ymd"except x];2=n;tm.i.dateFormats;{"dmy"}]x}

-tm.dateFormats:(!). flip( / fmt given single known position
+tm.i.dateFormats:(!). flip( / fmt given single known position
("d ";"dmy"); // 2nd 12 12
("m ";"mdy"); // Jan 12 12
("y ";"ymd"); // 1999 12 12
@@ -43,13 +43,23 @@ tm.dateFormats:(!). flip( / fmt given single known position
(" m";"ydm"); // 12 12 Jan (never conventionally used)
(" y";"dmy")) // 12 12 1999 (mdy is the American option)

// Turns string matching time regex into a q time
tm.i.parseTime:{
tm:"T"$x where vs[" ";x][0]in"1234567890:.";
ampm:regex.check[;x]each regex.objects`am`pm;
tm+$[ampm[0]&12=`hh$tm;-1;ampm[1]&12>`hh$tm;1;0]*12:00}


// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex)
tm.findTimes:{time:(tm.i.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01}

// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex)
tm.findDates:{[text]
rmInv:{x where not null x[;0]};
ym:regex.matchAll[regex.objects.yearmonth;text];
ymd:regex.matchAll[regex.objects.yearmonthday;text];
-dts:rmInv(tm.convYMD each ymd[;0]),'ymd;
+dts:rmInv(tm.i.convYMD each ymd[;0]),'ymd;
if[count dts;ym@:where not any ym[;1] within/: dts[; 3 4]];
-dts,:rmInv(tm.convYM each ym[;0]),'ym;
+dts,:rmInv(tm.i.convYM each ym[;0]),'ym;
dts iasc dts[;3]}
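
As a usage sketch: like `findTimes` in the README above, `tm.findDates` is intended for direct use (assuming it is aliased at the top level in the same way); each match is the 5-tuple described in the comment above:

```q
/ sketch: assumes tm.findDates is exposed at the top level like findTimes
q).nlp.findDates"I visited on 7 July 2019 and again in Jan 2020"
/ each row: (startDate;endDate;dateText;startIndex;1+endIndex)
```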

22 changes: 11 additions & 11 deletions email.q → code/email.q
@@ -1,11 +1,11 @@
\d .nlp

//Loading python script to extract rtf text
-system"l ",.nlp.path,"/","extract_rtf.p";
-striprtf:.p.get[`striprtf;<]
+system"l ",.nlp.path,"/","code/extract_rtf.p";
+i.striprtf:.p.get[`striprtf;<]

// Read mbox file, convert to table, parse metadata & content
-email.i.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp}
+email.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp}

email.i.findmime:{all(99=type each y`payload;x~/:y`contentType;0b~'y[`payload]@'`attachment)}
email.i.html2text:{email.i.bs[x;"html.parser"][`:get_text;"\\n"]`} / extract text from html
@@ -16,7 +16,7 @@ email.i.extractText:{
/ use beautiful soup to extract text from html
count i:where email.i.findmime["text/html"]x ;"\n\n"sv{email.i.html2text x[y][`payload]`content}[x]each i;
/ use python script to extract text from rtf
-count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{striprtf x[y][`payload]`content}[x]each i;
+count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{i.striprtf x[y][`payload]`content}[x]each i;
"\n\n"sv .z.s each x`payload]}


Expand All @@ -31,7 +31,7 @@ email.i.getToFrom:{[msg]
// Init python and q functions for reading mbox files
email.i.parseMail:{email.i.parseMbox1 email.i.msgFromString[x]`.}
email.i.parseMbox:{email.i.parseMbox1 each .p.list[<] .p.import[`mailbox;`:mbox]x}
-email.i.parseMbox1:{k!email.get[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x}
+email.i.parseMbox1:{k!email.get.i[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x}

email.i.bs:.p.import[`bs4]`:BeautifulSoup
email.i.getaddr:.p.import[`email.utils;`:getaddresses;<]
@@ -40,13 +40,13 @@ email.i.decodehdr:.p.import[`email.header;`:decode_header]
email.i.makehdr:.p.import[`email.header;`:make_header]
email.i.msgFromString:.p.import[`email]`:message_from_string

-email.get.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")}
-email.get.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")}
-email.get.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]}
-email.get.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]}
-email.get.contentType:{x[`:get_content_type][]`}
+email.get.i.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")}
+email.get.i.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")}
+email.get.i.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]}
+email.get.i.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]}
+email.get.i.contentType:{x[`:get_content_type][]`}
/ return a dict of `attachment`content or a table of payloads, content is byte[] for binary data, char[] for text
-email.get.payload:{
+email.get.i.payload:{
if[x[`:is_multipart][]`;:email.i.parseMbox1 each x[`:get_payload][]`];
raw:x[`:get_payload;`decode pykw 1]; / raw bytes decoded from base64 encoding, wrapped embedPy
if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_disposition][]`);:`attachment`content!(0b;raw`)];
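
For context, a usage sketch of the now-public `email.getMboxText` (renamed from `email.i.getMboxText` in this commit); the path is hypothetical:

```q
/ sketch: hypothetical mbox path; parses the archive and extracts text per message
emails:.nlp.email.getMboxText"/path/to/archive.mbox"
/ expected fields per message: sender, to, date, subject, contentType, payload, text
```
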
File renamed without changes.
