Skip to content

Commit

Permalink
dod pr merge
Browse files Browse the repository at this point in the history
  • Loading branch information
awilson-kx committed Jan 16, 2020
2 parents 82bed6b + 65904a3 commit df88c9d
Show file tree
Hide file tree
Showing 26 changed files with 440 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ install:
- if [[ "x$QLIC_KC" != "x" ]]; then
echo -n $QLIC_KC |base64 --decode > q/kc.lic;
pip -q install -r requirements.txt;
python -m spacy download en;
python -m spacy download en;
fi
beforescript:
- IMPLEMENTATION=$(if [[ "x$TRAVIS_TAG" == "x" ]]; then echo $TRAVIS_BRANCH-$TRAVIS_COMMIT; else echo $TRAVIS_TAG; fi;)
Expand Down
94 changes: 68 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ The NLP allows users to parse dataset using the spacy model from python in which
The following python packages are required:
1. numpy
2. beautifulsoup4
3. spacy
3. spacy

* Tests were run using spacy version 2.2.1

To install these packages with

Expand All @@ -27,30 +29,72 @@ pip install -r requirements.txt
```
or with conda
```bash
conda install --file requirements.txt
conda install -c conda-forge --file requirements.txt
```

* Download the English model using ```python -m spacy download en```


Other languages that spacy supports can be found at https://spacy.io/usage/models#languages

To use the languages in the alpha stage of development in spacy, the following steps can be taken:

To download the Chinese model, jieba must be installed

pip
```bash
pip install jieba
```

To download the Japanese model, mecab must be installed

pip
```bash
pip install mecab-python3
```

* spacy_hunspell is not a requirement to run these scripts, but can be installed using the following methods

Linux
```bash
sudo apt-get install libhunspell-dev hunspell
pip install spacy_hunspell
```

mac
```bash
wget https://iweb.dl.sourceforge.net/project/wordlist/speller/2019.10.06/hunspell-en_US-2019.10.06.zip;
unzip hunspell-en_US-2019.10.06; sudo mv en_US.dic en_US.aff /Library/Spelling/;
brew install hunspell;
export C_INCLUDE_PATH=/usr/local/include/hunspell;
sudo ln -sf /usr/local/lib/libhunspell-1.7.a /usr/local/lib/libhunspell.a;
sudo ln -sf /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell-1.7.dylib /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell.dylib;
CFLAGS=$(pkg-config --cflags hunspell) LDFLAGS=$(pkg-config --libs hunspell) pip install hunspell==0.5.0
```

At the moment spacy_hunspell does not support installation on Windows. More information can be found at https://github.com/tokestermw/spacy_hunspell

## Installation
Run tests with

```bash
q test.q
```

Place the library file in `$QHOME` and load `nlp/init.q`
Place the library file in `$QHOME` and load into a q instance using

```q
q)\l nlp/init.q
Loading utils.q
Loading regex.q
Loading sent.q
Loading parser.q
Loading time.q
Loading date.q
Loading email.q
Loading cluster.q
Loading nlp.q
q)\l nlp/nlp.q
q).nlp.loadfile`:init.q
Loading init.q
Loading code/utils.q
Loading code/regex.q
Loading code/sent.q
Loading code/parser.q
Loading code/time.q
Loading code/date.q
Loading code/email.q
Loading code/cluster.q
Loading code/nlp_code.q
q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20"
09:00:00.000 "9:00am" 18 24
10:20:00.000 "10:20" 45 50
Expand All @@ -73,15 +117,15 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can
KDB+ 3.5 2018.04.25 Copyright (C) 1993-2018 Kx Systems
l64/ 4()core 7905MB kx 0123456789ab 172.17.0.2 EXPIRE 2018.12.04 [email protected] KOD #0000000

Loading utils.q
Loading regex.q
Loading sent.q
Loading parser.q
Loading time.q
Loading date.q
Loading email.q
Loading cluster.q
Loading nlp.q
Loading code/utils.q
Loading code/regex.q
Loading code/sent.q
Loading code/parser.q
Loading code/time.q
Loading code/date.q
Loading code/email.q
Loading code/cluster.q
Loading code/nlp_code.q
q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20"
09:00:00.000 "9:00am" 18 24
10:20:00.000 "10:20" 45 50
Expand All @@ -97,9 +141,7 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can

Documentation is available on the [nlp](https://code.kx.com/v2/ml/nlp/) homepage.






## Status

Expand Down
2 changes: 1 addition & 1 deletion build/getembedpy.q
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
qhome:hsym`$$[not count u:getenv`QHOME;[-2"QHOME not defined";exit 1];u];
dl:{[s;url]$[s;;`/:]system"curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]}
dl:{[s;url]$[s;;`/:]system "curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]}
download:{
assets:.j.k[dl[0b]"https://api.github.com/repos/KxSystems/embedPy/releases/",$[not[count x]|x~"latest";"latest";"tags/",x]]`assets;
relurl:first exec browser_download_url from assets where name like{"*",x,"*"}(`m64`l64`w64!string`osx`linux`windows).z.o;
Expand Down
12 changes: 6 additions & 6 deletions cluster.q → code/cluster.q
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cluster.i.asKeywords:{i.fillEmptyDocs $[-9=type x[0]`keywords;x;x`keywords]}
// Get cohesiveness of cluster as measured by mean sum of squares error
cluster.MSE:{[docs]
$[0=n:count docs;0n;1=n;1.;0=sum count each docs;0n;
avg d*d:0^i.compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]}
avg d*d:0^compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]}

// Bisecting k-means algo (repeatedly splits largest cluster in 2)
cluster.bisectingKMeans:{[docs;k;n]
Expand All @@ -24,7 +24,7 @@ cluster.kmeans:{[docs;k;n]
}[docs]/(k;0N)#neg[nd]?nd:count docs:cluster.i.asKeywords docs}

// Match each doc to nearest centroid
cluster.i.groupByCentroids:{[centroids;docs]
cluster.groupByCentroids:{[centroids;docs]
value group{[centroids;doc]$[0<m:max s:compareDocs[doc]each centroids;s?m;0n]}[centroids]each docs}

// Merge any clusters with significant overlap into a single cluster
Expand Down Expand Up @@ -66,7 +66,7 @@ cluster.MCL:{[docs;mn;sample]
keywords:docs idx:$[sample;(neg"i"$sqrt count docs)?count docs;til count docs];
similarities:i.matrixFromRaggedList i.compareDocToCorpus[keywords]each til count keywords;
// Find all the clusters
clustersOfOne:1=count each clusters:cluster.similarityMatrix similarities>=mn;
clustersOfOne:1=count each clusters:cluster.i.similarityMatrix similarities>=mn;
if[not sample;:clusters where not clustersOfOne];
// Any cluster of 1 documents isn't a cluster, so throw it out
outliers:raze clusters where clustersOfOne;
Expand All @@ -76,11 +76,11 @@ cluster.MCL:{[docs;mn;sample]
centroids:avg each keywords clusters;
// Move each non-outlier to the nearest centroid
nonOutliers:(til count docs)except idx outliers;
nonOutliers cluster.i.groupByCentroids[centroids;docs nonOutliers]}
nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers]}

// Graph clustering that works on a similarity matrix
cluster.i.columnNormalize:{[mat]0f^mat%\:sum mat}
cluster.similarityMatrix:{[mat]
cluster.i.similarityMatrix:{[mat]
matrix:"f"$mat;
// SM Van Dongen's MCL clustering algorithm
MCL:{[mat]
Expand All @@ -105,4 +105,4 @@ cluster.summarize:{[docs;n]
centroids,:nearest:i.maxIndex docs[;i.maxIndex summary];
summary-:docs nearest;
summary:(where summary<0)_ summary];
cluster.i.groupByCentroids[docs centroids;docs]}
cluster.groupByCentroids[docs centroids;docs]}
38 changes: 24 additions & 14 deletions date.q → code/date_time.q
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
\d .nlp

// Pad day string to 2 digits
tm.parseDay:{-2#"0",x where x in .Q.n}
tm.i.parseDay:{-2#"0",x where x in .Q.n}

// Convert month string and pad to 2 digits
tm.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12
tm.parseMonth:{-2#"0",string x^tm.months x:lower`$3 sublist x}
tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12
tm.i.parseMonth:{-2#"0",string x^tm.i.months x:lower`$3 sublist x}

// Pad year string to 4 digits (>35 deemed 1900s)
tm.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x}
tm.i.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x}

// Convert year string to date range
tm.convY:{"D"$x,/:(".01.01";".12.31")}
tm.i.convY:{"D"$x,/:(".01.01";".12.31")}

// Convert yearmonth string to date range
tm.convYM:{
tm.i.convYM:{
matches:ungroup([fmt:"ym"]txt:regex.matchAll[;x]each regex.objects`year`month);
matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt;
fmt:{@[x;where not xx;except[;raze x where xx:1=count each x]]}/[matches`fmt];
fmt:raze@[fmt;i where 1<count each i:group fmt;:;" "];
0 -1+"d"$0 1+"M"$"."sv tm[`parseYear`parseMonth]@'matches[`txt]idesc fmt}
0 -1+"d"$0 1+"M"$"."sv tm.i[`parseYear`parseMonth]@'matches[`txt]idesc fmt}

// Convert yearmonthday string to date range
tm.convYMD:{
tm.i.convYMD:{
matches:ungroup([fmt:"ymd"]txt:regex.matchAll[;x]each regex.objects`year`month`day);
matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt;
fmt:{@[x;i unq;:;"ymd" unq:where 1=count each i:where each "ymd" in/:\:x]}/[matches`fmt];
fmt:tm.resolveFormat raze@[fmt;where 1<count each fmt;:;" "];
2#"D"$"."sv tm[`parseYear`parseMonth`parseDay]@'matches[`txt]idesc fmt}
fmt:tm.i.resolveFormat raze@[fmt;where 1<count each fmt;:;" "];
2#"D"$"."sv tm.i[`parseYear`parseMonth`parseDay]@'matches[`txt]idesc fmt}

// Fill in blanks in date format string
tm.resolveFormat:{$[0=n:sum" "=x;;1=n;ssr[;" ";first"ymd"except x];2=n;tm.dateFormats;{"dmy"}]x}
tm.i.resolveFormat:{$[0=n:sum" "=x;;1=n;ssr[;" ";first"ymd"except x];2=n;tm.i.dateFormats;{"dmy"}]x}

tm.dateFormats:(!). flip( / fmt given single known position
tm.i.dateFormats:(!). flip( / fmt given single known position
("d ";"dmy"); // 2nd 12 12
("m ";"mdy"); // Jan 12 12
("y ";"ymd"); // 1999 12 12
Expand All @@ -43,13 +43,23 @@ tm.dateFormats:(!). flip( / fmt given single known position
(" m";"ydm"); // 12 12 Jan This is never conventionally used
(" y";"dmy")) // 12 12 1999 //mdy is the american option

// Turns string matching time regex into a q time
// x: string holding the matched time text (may carry an am/pm marker)
// The characters of x picked out by a mask built from its first space-delimited
// token (digits, ":" and ".") are cast with "T"$. x is then checked against the
// `am and `pm entries of regex.objects: if am matches and the hour is 12 the
// result has 12:00 subtracted, if pm matches and the hour is below 12 it has
// 12:00 added, otherwise it is returned unchanged.
tm.i.parseTime:{
tm:"T"$x where vs[" ";x][0]in"1234567890:.";
ampm:regex.check[;x]each regex.objects`am`pm;
tm+$[ampm[0]&12=`hh$tm;-1;ampm[1]&12>`hh$tm;1;0]*12:00}


// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex)
// Every match of regex.objects.time in x has its text (first item of the match)
// parsed with tm.i.parseTime and prepended to the match; only entries whose
// parsed time is less than 24:01 are returned.
tm.findTimes:{time:(tm.i.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01}

// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex)
tm.findDates:{[text]
rmInv:{x where not null x[;0]};
ym:regex.matchAll[regex.objects.yearmonth;text];
ymd:regex.matchAll[regex.objects.yearmonthday;text];
dts:rmInv(tm.convYMD each ymd[;0]),'ymd;
dts:rmInv(tm.i.convYMD each ymd[;0]),'ymd;
if[count dts;ym@:where not any ym[;1] within/: dts[; 3 4]];
dts,:rmInv(tm.convYM each ym[;0]),'ym;
dts,:rmInv(tm.i.convYM each ym[;0]),'ym;
dts iasc dts[;3]}

22 changes: 11 additions & 11 deletions email.q → code/email.q
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
\d .nlp

//Loading python script to extract rtf text
system"l ",.nlp.path,"/","extract_rtf.p";
striprtf:.p.get[`striprtf;<]
system"l ",.nlp.path,"/","code/extract_rtf.p";
i.striprtf:.p.get[`striprtf;<]

// Read mbox file, convert to table, parse metadata & content
email.i.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp}
email.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp}

email.i.findmime:{all(99=type each y`payload;x~/:y`contentType;0b~'y[`payload]@'`attachment)}
email.i.html2text:{email.i.bs[x;"html.parser"][`:get_text;"\\n"]`} / extract text from html
Expand All @@ -16,7 +16,7 @@ email.i.extractText:{
/ use beautiful soup to extract text from html
count i:where email.i.findmime["text/html"]x ;"\n\n"sv{email.i.html2text x[y][`payload]`content}[x]each i;
/ use python script to extract text from rtf
count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{striprtf x[y][`payload]`content}[x]each i;
count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{i.striprtf x[y][`payload]`content}[x]each i;
"\n\n"sv .z.s each x`payload]}


Expand All @@ -31,7 +31,7 @@ email.i.getToFrom:{[msg]
// Init python and q functions for reading mbox files
email.i.parseMail:{email.i.parseMbox1 email.i.msgFromString[x]`.}
email.i.parseMbox:{email.i.parseMbox1 each .p.list[<] .p.import[`mailbox;`:mbox]x}
email.i.parseMbox1:{k!email.get[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x}
email.i.parseMbox1:{k!email.get.i[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x}

email.i.bs:.p.import[`bs4]`:BeautifulSoup
email.i.getaddr:.p.import[`email.utils;`:getaddresses;<]
Expand All @@ -40,13 +40,13 @@ email.i.decodehdr:.p.import[`email.header;`:decode_header]
email.i.makehdr:.p.import[`email.header;`:make_header]
email.i.msgFromString:.p.import[`email]`:message_from_string

email.get.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")}
email.get.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")}
email.get.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]}
email.get.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]}
email.get.contentType:{x[`:get_content_type][]`}
email.get.i.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")}
email.get.i.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")}
email.get.i.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]}
email.get.i.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]}
email.get.i.contentType:{x[`:get_content_type][]`}
/ return a dict of `attachment`content or a table of payloads, content is byte[] for binary data, char[] for text
email.get.payload:{
email.get.i.payload:{
if[x[`:is_multipart][]`;:email.i.parseMbox1 each x[`:get_payload][]`];
raw:x[`:get_payload;`decode pykw 1]; / raw bytes decoded from base64 encoding, wrapped embedPy
if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_disposition][]`);:`attachment`content!(0b;raw`)];
Expand Down
File renamed without changes.
Loading

0 comments on commit df88c9d

Please sign in to comment.