diff --git a/.travis.yml b/.travis.yml index 1ad3f52..d72bb68 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,7 @@ install: - if [[ "x$QLIC_KC" != "x" ]]; then echo -n $QLIC_KC |base64 --decode > q/kc.lic; pip -q install -r requirements.txt; - python -m spacy download en; + python -m spacy download en; fi beforescript: - IMPLEMENTATION=$(if [[ "x$TRAVIS_TAG" == "x" ]]; then echo $TRAVIS_BRANCH-$TRAVIS_COMMIT; else echo $TRAVIS_TAG; fi;) diff --git a/README.md b/README.md index a9d3821..f0e0e7a 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ The NLP allows users to parse dataset using the spacy model from python in which The following python packages are required: 1. numpy 2. beautifulsoup4 - 3. spacy + 3. spacy + +* Tests were run using spacy version 2.2.1 To install these packages with @@ -27,11 +29,50 @@ pip install -r requirements.txt ``` or with conda ```bash -conda install --file requirements.txt +conda install -c conda-forge --file requirements.txt ``` * Download the English model using ```python -m spacy download en``` - +
+Other languages that spacy supports can be found at https://spacy.io/usage/models#languages +
+To use languages that are in the alpha stage of development in spacy, the following steps can be taken: +
+To download the Chinese model, jieba must be installed +
+pip +```bash +pip install jieba +``` +
+To download the Japanese model, mecab must be installed +
+pip +```bash +pip install mecab-python3 +``` +
+* spacy_hunspell is not a requirement to run these scripts, but can be installed using the following methods: +
+Linux +```bash +sudo apt-get install libhunspell-dev hunspell +pip install spacy_hunspell +``` +
+macOS +```bash +wget https://iweb.dl.sourceforge.net/project/wordlist/speller/2019.10.06/hunspell-en_US-2019.10.06.zip; +unzip hunspell-en_US-2019.10.06; sudo mv en_US.dic en_US.aff /Library/Spelling/; +brew install hunspell; +export C_INCLUDE_PATH=/usr/local/include/hunspell; +sudo ln -sf /usr/local/lib/libhunspell-1.7.a /usr/local/lib/libhunspell.a; +sudo ln -sf /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell-1.7.dylib /usr/local/Cellar/hunspell/1.7.0_2/lib/libhunspell.dylib; +CFLAGS=$(pkg-config --cflags hunspell) LDFLAGS=$(pkg-config --libs hunspell) pip install hunspell==0.5.0 +``` +
+At the moment spacy_hunspell does not support installation on Windows. 
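+Once spacy_hunspell is installed, the new `spell` option added to `.nlp.newParser` in this change can be requested alongside the usual parser options. A minimal sketch, assuming the library has already been loaded as shown in the Installation section below (the variable name and example sentence are illustrative only):
+
+```q
+q)spellParser:.nlp.newParser[`en;`tokens`spell]  / spell correction requires spacy_hunspell
+q)first spellParser enlist"Ther is a tpyo in this sentense"
+```
+
+As implemented in code/parser.q, no `spell` column is returned; the option makes the parser replace misspelled tokens with hunspell suggestions before the other attributes are extracted.
+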
More information can be found at https://github.com/tokestermw/spacy_hunspell + ## Installation Run tests with @@ -39,18 +80,21 @@ Run tests with q test.q ``` -Place the library file in `$QHOME` and load `nlp/init.q` +Place the library file in `$QHOME` and load into a q instance using + ```q -q)\l nlp/init.q -Loading utils.q -Loading regex.q -Loading sent.q -Loading parser.q -Loading time.q -Loading date.q -Loading email.q -Loading cluster.q -Loading nlp.q +q)\l nlp/nlp.q +q).nlp.loadfile`:init.q +Loading init.q +Loading code/utils.q +Loading code/regex.q +Loading code/sent.q +Loading code/parser.q +Loading code/time.q +Loading code/date.q +Loading code/email.q +Loading code/cluster.q +Loading code/nlp_code.q q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20" 09:00:00.000 "9:00am" 18 24 10:20:00.000 "10:20" 45 50 @@ -73,15 +117,15 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can KDB+ 3.5 2018.04.25 Copyright (C) 1993-2018 Kx Systems l64/ 4()core 7905MB kx 0123456789ab 172.17.0.2 EXPIRE 2018.12.04 bob@example.com KOD #0000000 - Loading utils.q - Loading regex.q - Loading sent.q - Loading parser.q - Loading time.q - Loading date.q - Loading email.q - Loading cluster.q - Loading nlp.q + Loading code/utils.q + Loading code/regex.q + Loading code/sent.q + Loading code/parser.q + Loading code/time.q + Loading code/date.q + Loading code/email.q + Loading code/cluster.q + Loading code/nlp_code.q q).nlp.findTimes"I went to work at 9:00am and had a coffee at 10:20" 09:00:00.000 "9:00am" 18 24 10:20:00.000 "10:20" 45 50 @@ -97,9 +141,7 @@ If you have [Docker installed](https://www.docker.com/community-edition) you can Documentation is available on the [nlp](https://code.kx.com/v2/ml/nlp/) homepage. - - - + ## Status diff --git a/build/getembedpy.q b/build/getembedpy.q index bc802b8..fe30fc6 100644 --- a/build/getembedpy.q +++ b/build/getembedpy.q @@ -1,5 +1,5 @@ qhome:hsym`$$[not count u:getenv`QHOME;[-2"QHOME not defined";exit 1];u]; -dl:{[s;url]$[s;;`/:]system"curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]} +dl:{[s;url]$[s;;`/:]system "curl -u ",getenv[`GH_APIREAD]," -s -L ",url,$[s;" -J -O";""]} download:{ assets:.j.k[dl[0b]"https://api.github.com/repos/KxSystems/embedPy/releases/",$[not[count x]|x~"latest";"latest";"tags/",x]]`assets; relurl:first exec browser_download_url from assets where name like{"*",x,"*"}(`m64`l64`w64!string`osx`linux`windows).z.o; diff --git a/cluster.q b/code/cluster.q similarity index 92% rename from cluster.q rename to code/cluster.q index 2c60e3f..3f981fb 100644 --- a/cluster.q +++ b/code/cluster.q @@ -6,7 +6,7 @@ cluster.i.asKeywords:{i.fillEmptyDocs $[-9=type x[0]`keywords;x;x`keywords]} // Get cohesiveness of cluster as measured by mean sum of squares error cluster.MSE:{[docs] $[0=n:count docs;0n;1=n;1.;0=sum count each docs;0n; - avg d*d:0^i.compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]} + avg d*d:0^compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]} // Bisecting k-means algo (repeatedly splits largest cluster in 2) cluster.bisectingKMeans:{[docs;k;n] @@ -24,7 +24,7 @@ cluster.kmeans:{[docs;k;n] }[docs]/(k;0N)#neg[nd]?nd:count docs:cluster.i.asKeywords docs} // Match each doc to nearest centroid -cluster.i.groupByCentroids:{[centroids;docs] +cluster.groupByCentroids:{[centroids;docs] value group{[centroids;doc]$[0=mn; + clustersOfOne:1=count each clusters:cluster.i.similarityMatrix similarities>=mn; if[not sample;:clusters where not 
clustersOfOne]; // Any cluster of 1 documents isn't a cluster, so throw it out outliers:raze clusters where clustersOfOne; @@ -76,11 +76,11 @@ cluster.MCL:{[docs;mn;sample] centroids:avg each keywords clusters; // Move each non-outlier to the nearest centroid nonOutliers:(til count docs)except idx outliers; - nonOutliers cluster.i.groupByCentroids[centroids;docs nonOutliers]} + nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers]} // Graph clustering that works on a similarity matrix cluster.i.columnNormalize:{[mat]0f^mat%\:sum mat} -cluster.similarityMatrix:{[mat] +cluster.i.similarityMatrix:{[mat] matrix:"f"$mat; // SM Van Dongen's MCL clustering algorithm MCL:{[mat] @@ -105,4 +105,4 @@ cluster.summarize:{[docs;n] centroids,:nearest:i.maxIndex docs[;i.maxIndex summary]; summary-:docs nearest; summary:(where summary<0)_ summary]; - cluster.i.groupByCentroids[docs centroids;docs]} + cluster.groupByCentroids[docs centroids;docs]} diff --git a/date.q b/code/date_time.q similarity index 57% rename from date.q rename to code/date_time.q index d05555e..9bfa945 100644 --- a/date.q +++ b/code/date_time.q @@ -1,38 +1,38 @@ \d .nlp // Pad day string to 2 digits -tm.parseDay:{-2#"0",x where x in .Q.n} +tm.i.parseDay:{-2#"0",x where x in .Q.n} // Convert month string and pad to 2 digits -tm.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 -tm.parseMonth:{-2#"0",string x^tm.months x:lower`$3 sublist x} +tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 +tm.i.parseMonth:{-2#"0",string x^tm.i.months x:lower`$3 sublist x} // Pad year string to 4 digits (>35 deemed 1900s) -tm.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x} +tm.i.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x} // Convert year string to date range -tm.convY:{"D"$x,/:(".01.01";".12.31")} +tm.i.convY:{"D"$x,/:(".01.01";".12.31")} // Convert yearmonth string to date range -tm.convYM:{ +tm.i.convYM:{ matches:ungroup([fmt:"ym"]txt:regex.matchAll[;x]each regex.objects`year`month); matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt; fmt:{@[x;where not xx;except[;raze x where xx:1=count each x]]}/[matches`fmt]; fmt:raze@[fmt;i where 1`hh$tm;1;0]*12:00} + + +// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) +tm.findTimes:{time:(tm.i.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01} + // Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) tm.findDates:{[text] rmInv:{x where not null x[;0]}; ym:regex.matchAll[regex.objects.yearmonth;text]; ymd:regex.matchAll[regex.objects.yearmonthday;text]; - dts:rmInv(tm.convYMD each ymd[;0]),'ymd; + dts:rmInv(tm.i.convYMD each ymd[;0]),'ymd; if[count dts;ym@:where not any ym[;1] within/: dts[; 3 4]]; - dts,:rmInv(tm.convYM each ym[;0]),'ym; + dts,:rmInv(tm.i.convYM each ym[;0]),'ym; dts iasc dts[;3]} diff --git a/email.q b/code/email.q similarity index 76% rename from email.q rename to code/email.q index 9c60cb3..490e4e9 100644 --- a/email.q +++ b/code/email.q @@ -1,11 +1,11 @@ \d .nlp //Loading python script to extract rtf text -system"l ",.nlp.path,"/","extract_rtf.p"; -striprtf:.p.get[`striprtf;<] +system"l ",.nlp.path,"/","code/extract_rtf.p"; +i.striprtf:.p.get[`striprtf;<] // Read mbox file, convert to table, parse metadata & content -email.i.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp} +email.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from 
email.i.parseMbox fp} email.i.findmime:{all(99=type each y`payload;x~/:y`contentType;0b~'y[`payload]@'`attachment)} email.i.html2text:{email.i.bs[x;"html.parser"][`:get_text;"\\n"]`} / extract text from html @@ -16,7 +16,7 @@ email.i.extractText:{ / use beautiful soup to extract text from html count i:where email.i.findmime["text/html"]x ;"\n\n"sv{email.i.html2text x[y][`payload]`content}[x]each i; / use python script to extract text from rtf - count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{striprtf x[y][`payload]`content}[x]each i; + count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{i.striprtf x[y][`payload]`content}[x]each i; "\n\n"sv .z.s each x`payload]} @@ -31,7 +31,7 @@ email.i.getToFrom:{[msg] // Init python and q functions for reading mbox files email.i.parseMail:{email.i.parseMbox1 email.i.msgFromString[x]`.} email.i.parseMbox:{email.i.parseMbox1 each .p.list[<] .p.import[`mailbox;`:mbox]x} -email.i.parseMbox1:{k!email.get[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x} +email.i.parseMbox1:{k!email.get.i[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x} email.i.bs:.p.import[`bs4]`:BeautifulSoup email.i.getaddr:.p.import[`email.utils;`:getaddresses;<] @@ -40,13 +40,13 @@ email.i.decodehdr:.p.import[`email.header;`:decode_header] email.i.makehdr:.p.import[`email.header;`:make_header] email.i.msgFromString:.p.import[`email]`:message_from_string -email.get.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")} -email.get.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")} -email.get.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]} -email.get.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]} -email.get.contentType:{x[`:get_content_type][]`} +email.get.i.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")} +email.get.i.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")} +email.get.i.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]} +email.get.i.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]} +email.get.i.contentType:{x[`:get_content_type][]`} / return a dict of `attachment`content or a table of payloads, content is byte[] for binary data, char[] for text -email.get.payload:{ +email.get.i.payload:{ if[x[`:is_multipart][]`;:email.i.parseMbox1 each x[`:get_payload][]`]; raw:x[`:get_payload;`decode pykw 1]; / raw bytes decoded from base64 encoding, wrapped embedPy if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_disposition][]`);:`attachment`content!(0b;raw`)]; diff --git a/extract_rtf.p b/code/extract_rtf.p similarity index 100% rename from extract_rtf.p rename to code/extract_rtf.p diff --git a/code/nlp_code.q b/code/nlp_code.q new file mode 100644 index 0000000..ca361b5 --- /dev/null +++ b/code/nlp_code.q @@ -0,0 +1,163 @@ +\d .nlp + +// Date-Time + +// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) +findDates:tm.findDates + +// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) +findTimes:tm.findTimes + +// Email + +// Read mbox file, convert to table, parse metadata & content +email.loadEmails:loadEmails:email.getMboxText + +// Graph of who emailed whom, inc number of mails 
+email.getGraph:{[msgs] + 0!`volume xdesc select volume:count i by sender,to from flip`sender`to!flip`$raze email.i.getToFrom each msgs} + +email.parseMail:email.i.parseMail + +// Sentiment + +// Calculate sentiment of sentence of short message +sentiment:sent.score + +// Comparing docs/terms + +// Give 2 dicts of each term's affinity to each corpus +// Algorithm from Rayson, Paul, and Roger Garside. "Comparing corpora using frequency profiling." +// Proceedings of the workshop on Comparing Corpora. Association for Computational Linguistics, 2000 +compareCorpora:{[corp1;corp2] + if[(not count corp1)|(not count corp2);:((`$())!();(`$())!())]; + getTermCount:{[corp] + i.fastSum{1+log count each group x}each corp[`tokens]@'where each not corp`isStop}; + totalWordCountA:sum termCountA:getTermCount corp1; + totalWordCountB:sum termCountB:getTermCount corp2; + // The expected termCount of each term in each corpus + coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); + expectedA:totalWordCountA*coef; + expectedB:totalWordCountB*coef; + // Return the differences between the corpora + (desc termCountA*log termCountA%expectedA;desc termCountB*log termCountB%expectedB)} + +// Calc cosine similarity of two docs +compareDocs:{cosineSimilarity .(x;y)@\:distinct raze key each(x;y)} + +// Compare similarity of 2 vectors +cosineSimilarity:{sum[x*y]%(sqrt sum x*x)*sqrt sum y*y} + +// How much each term contributes to the cosine similarity +explainSimilarity:{[doc1;doc2] + alignedKeys:inter[key doc1;key doc2]; + doc1@:alignedKeys; + doc2@:alignedKeys; + product:(doc2%i.magnitude doc1)*(doc2%i.magnitude doc2); + desc alignedKeys!product%sum product} + +// Cosine similarity of doc and centroid +compareDocToCentroid:{[centroid;doc] + doc@:alignedKeys:distinct key[centroid],key doc; + cosineSimilarity[doc;centroid[alignedKeys]-doc]} + +// Calc cosine similarity between doc and entire corpus +compareDocToCorpus:i.compareDocToCorpus + +// Jaro-Winkler distance between 2 strings +jaroWinkler:{i.jaroWinkler[lower x;lower y]} + +// Feature Vectors + +// Generate feature vector (of stemmed tokens) for a term +findRelatedTerms:{[docs;term] + sent:raze docs[`sentIndices]cut'@'[docs[`tokens];where each docs`isStop;:;`]; + sent@:asc distinct raze 0|-1 0 1+\:where(term:lower term)in/:sent; + ccur:` _ count each group raze distinct each sent; + tcur:idx@'group each docs[`tokens]@'idx:where each docs[`tokens]in\:key ccur; + tcur:i.fastSum((count distinct@)each)each docs[`sentIndices]bin'tcur; + ccur%:tcur term; + tcur%:sum count each docs`sentIndices; + desc except[where r>0;term]#r:(ccur-tcur)%sqrt tcur*1-tcur} + +// Find runs containing term where each word has above average co-ocurrance with term +extractPhrases:{[corpus;term] + relevant:term,sublist[150]where 01)#r:count each group r where term in/:r:raze tokens@'runs} + +// On a conceptually single doc (e.g. novel), gives better results than TF-IDF +// This algorithm is explained in the paper +// Carpena, P., et al. "Level statistics of words: Finding keywords in literary texts and symbolic sequences." +// Physical Review E 79.3 (2009): 035102. 
+keywordsContinuous:{[docs] + n:count each gt:group text:raze docs[`tokens]@'where each not docs`isStop; + words:where n>=4|.00002*count text; + dist:deltas each words#gt; + sigma:(dev each dist)%(avg each dist)*sqrt 1-(n:words#n)%count text; + std_sigma:1%sqrt[n]*1+2.8*n xexp -0.865; + chev_sigma:((2*n)-1)%2*n+1; + desc(sigma-chev_sigma)%std_sigma} + +// Find TFIDF scores for all terms in all documents +TFIDF:{[corpus] + tokens:corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; + tab:{x!{sum[x in y]%count x}[y]each x}'[words:distinct each tokens;tokens]; + tab*idf:1+log count[tokens]%{sum{x in y}[y]each x}[tokens]each words} + +TFIDF_tot:{[corpus]desc sum t%'sum each t:TFIDF corpus} + +// Parse Data + +// Create a new parser using a spaCy model (must already be installed) +newParser:parser.newParser + +// Parse urls to dictionaries +parseURLs:{`scheme`domainName`path`parameters`query`fragment!i.parseURLs x} + +// Exploratory Analysis + +// Find runs of tokens whose POS tags are in the set passed in +// Returns pair (text; firstIndex) +findPOSRuns:{[tagType;tags;doc] + start:where 1=deltas matchingTag:doc[tagType]in tags; + ii:start+til each lengths:sum each start cut matchingTag; + runs:`$" "sv/:string each doc[`tokens]start+til each lengths; + flip(runs;ii)} + +// Currently only for 2-gram +bi_gram:{[corpus] + tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; + occ:(distinct tokens)!{count where y=x}[tokens]each distinct tokens; + raze{[x;y;z;n](enlist(z;n))!enlist(count where n=x 1+where z=x)%y[z]}[tokens;occ]''[tokens;next tokens]} + +// Util + +// Find Regular expressions within texts +findRegex:{[text;expr]($[n;enlist;]expr)!$[n:1=count[expr];enlist;]{regex.matchAll[regex.objects[x];y]}[;text]each expr} + +// Remove any ascii characters from a text +ascii:{x where x within (0;127)} + +// Remove certain characters from a string of text +rmv_custom:{rtrim raze(l where{not(max ,'/)x like/:y}[;y]each l:" "vs x),'" "} + +// Remove and replace certain characters from a string of text +rmv_master:{{x:ssr[x;y;z];x}[;;z]/[x;y]} + +// Detect language from text +detectLang:{[text]`$.p.import[`langdetect][`:detect;<][text]} + +// Import all files in a dir recursively +loadTextFromDir:{[fp] + path:{[fp]raze$[-11=type k:key fp:hsym fp;fp;.z.s each` sv'fp,'k]}`$fp; + ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path)} + +// Get all sentences for a doc +getSentences:i.getSentences + +// n-gram +ngram:{[corpus;n] + tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; + raze[key[b],/:'{key x}each value b]!raze value each value b:{(count each group x)%count x + }each last[tab]group neg[n-1]_flip(n-1)#tab:rotate\:[til n]tokens} diff --git a/parser.q b/code/parser.q similarity index 81% rename from parser.q rename to code/parser.q index a40b8da..2e4e705 100644 --- a/parser.q +++ b/code/parser.q @@ -2,10 +2,24 @@ .p.import[`sys;:;`:argv;()]; / spacy expects python be the main process +p)def spell(doc,model): + lst=[] + for s in doc: + if s._.hunspell_spell==False: + sug=s._.hunspell_suggest + if len(sug)>0: + ([lst.append(n)for n in model((sug)[0])]) + else:lst.append(s) + else: + lst.append(s) + return lst + // Python functions for running spacy p)def get_doc_info(parser,tokenAttrs,opts,text): - doc=parser(text) - res=[[getattr(w,a)for w in doc]for a in tokenAttrs] + doc=doc1=parser(text) + if('spell' in opts): + doc1=spell(doc,parser) + res=[[getattr(w,a)for w in doc1]for a in 
tokenAttrs] if('sentChars' in opts): # indices of first+last char per sentence res.append([(s.start_char,s.end_char)for s in doc.sents]) if('sentIndices' in opts): # index of first token per sentence @@ -49,8 +63,8 @@ parser.i.alphalang:(!). flip( (`zh;`Chinese)) // Create new parser -// Valid opts : text keywords likeEmail likeNumber likeURL isStop tokens lemmas uniPOS pennPOS starts sentChars sentIndices -parser.i.newParser:{[lang;opts] +// Valid opts : text keywords likeEmail likeNumber likeURL isStop tokens lemmas uniPOS pennPOS starts sentChars sentIndices spell +parser.newParser:{[lang;opts] opts:{distinct x,raze parser.i.depOpts x}/[colnames:opts]; disabled:`ner`tagger`parser except opts; model:parser.i.newSubParser[lang;opts;disabled]; @@ -65,6 +79,9 @@ parser.i.newSubParser:{[lang;opts;disabled] model:.p.import[$[`~chklng;`spacy;sv[`]`spacy.lang,lang]][hsym$[`~chklng;`load;chklng] ]. raze[$[`~chklng;lang;()];`disable pykw disabled]; if[`sbd in opts;model[`:add_pipe]$[`~chklng;model[`:create_pipe;`sentencizer];.p.pyget `x_sbd]]; + if[`spell in opts;if[not .p.import[`spacy.tokens][`:Token][`:has_extension]["hunspell_spell"]`; + sphun:.p.import[`spacy_hunspell]`:spaCyHunSpell;hunspell:sphun[model; + $[`Darwin~syst:`$.p.import[`platform][`:system][]`;`mac;lower syst]];model[`:add_pipe]hunspell]]; model} // Operations that must be done in q, or give better performance in q @@ -72,7 +89,7 @@ parser.i.runParser:{[pyParser;colnames;opts;stopwords;docs] t:parser.i.cleanUTF8 each docs; parsed:parser.i.unpack[pyParser;opts;stopwords]each t; if[`keywords in opts;parsed[`keywords]:TFIDF parsed]; - colnames#@[parsed;`text;:;t]} + (($[1=count colnames;enlist;]colnames) except `spell)#@[parsed;`text;:;t]} // Operations that must be done in q, or give better performance in q parser.i.unpack:{[pyParser;opts;stopwords;text] @@ -94,7 +111,7 @@ parser.i.unpack:{[pyParser;opts;stopwords;text] // Python indexes into strings by char instead of byte, so must be modified to index a q string parser.i.adjustIndices:{[text;doc] - adj:cont-til count cont:where text within"\200\277"; + adj:cont-til count cont:where ($[1~count text;enlist;]text) within"\200\277"; if[`starts in cols doc;doc[`starts ]+:adj binr 1+doc`starts ]; if[`sentChars in cols doc;doc[`sentChars]+:adj binr 1+doc`sentChars]; doc} diff --git a/regex.q b/code/regex.q similarity index 94% rename from regex.q rename to code/regex.q index c70c397..020b7b7 100644 --- a/regex.q +++ b/code/regex.q @@ -28,7 +28,5 @@ regex.patterns.yearmonthdayList:"(",sv["|";regex.patterns`year`month`day],")" regex.patterns.yearmonth: "(",sv[regex.patterns.dtsep;2#enlist regex.patterns.yearmonthList ],")" regex.patterns.yearmonthday: "(",sv[regex.patterns.dtsep;3#enlist regex.patterns.yearmonthdayList],")" -/regex.patterns.mnsep:"[\\t \\\\]+" -/regex.patterns.yearmonth: "(",sv[regex.patterns.mnsep;2#enlist regex.patterns.yearmonthList ],")" - regex.objects:regex.compile[;1b]each 1_regex.patterns + diff --git a/sent.q b/code/sent.q similarity index 73% rename from sent.q rename to code/sent.q index 1d52f7f..8e977a1 100644 --- a/sent.q +++ b/code/sent.q @@ -1,7 +1,7 @@ \d .nlp // Create regex used for tokenizing -sent.tokenPattern:{ +sent.i.tokenPattern:{ rightFacingEmoticons:"[<>]?[:;=8][\\-o\\*\\']?[\\)\\]\\(\\[dDpP/\\:\\}\\{@\\|\\\\]"; / n.b. 
Left-facing rarely used miscEmoticons:"<3|[0o][._][0o]|tokens; - upperIndices:where isUpperCase & not all isUpperCase; - valences[upperIndices]+:sent.ALLCAPS_INCR*signum valences upperIndices; - valences:sent.applyBoosters[tokens;isUpperCase;valences]; - valences:sent.negationCheck[tokens;valences]; - valences:sent.butCheck[tokens;valences]; - sent.scoreValence[0f^valences;text]} - // Calculate sentiment given individual valences -sent.scoreValence:{[valences;text] +sent.i.scoreValence:{[valences;text] if[not count valences;:`compound`pos`neg`neu!0 0 0 0f]; compound:sum valences; // Punctuation can increase the intensity of the sentiment - compound+:signum[compound]*punctAmplifier:sent.amplifyEP[text]+sent.amplifyQM text; + compound+:signum[compound]*punctAmplifier:sent.i.amplifyEP[text]+sent.i.amplifyQM text; // Normalize score compound:{x%sqrt 15+x*x}compound; // Discriminate between positive, negative and neutral sentiment scores @@ -131,3 +120,15 @@ sent.scoreValence:{[valences;text] // Used to noramlize the pos, neg and neutral sentiment total:positive+neutral+abs negative; `compound`pos`neg`neu!(compound,abs(positive;negative;neutral)%total)} + +// Calculate sentiment of a sentence of short message +sent.score:{[text] + valences:sent.i.lexicon tokens:lower rawTokens:sent.i.tokenize text; + isUpperCase:(rawTokens=upper rawTokens)& rawTokens<>tokens; + upperIndices:where isUpperCase & not all isUpperCase; + valences[upperIndices]+:sent.i.ALLCAPS_INCR*signum valences upperIndices; + valences:sent.i.applyBoosters[tokens;isUpperCase;valences]; + valences:sent.i.negationCheck[tokens;valences]; + valences:sent.i.butCheck[tokens;valences]; + sent.i.scoreValence[0f^valences;text]} + diff --git a/utils.q b/code/utils.q similarity index 92% rename from utils.q rename to code/utils.q index c522369..8f5f860 100644 --- a/utils.q +++ b/code/utils.q @@ -47,14 +47,6 @@ i.jaro:{[s1;s2] // Jaro-Winkler distance of 2 strings i.jaroWinkler:{$[0.7=4|.00002*count text; - dist:deltas each words#gt; - sigma:(dev each dist)%(avg each dist)*sqrt 1-(n:words#n)%count text; - std_sigma:1%sqrt[n]*1+2.8*n xexp -0.865; - chev_sigma:((2*n)-1)%2*n+1; - desc(sigma-chev_sigma)%std_sigma} - -// Give 2 dicts of each term's affinity to each corpus -// Algorithm from Rayson, Paul, and Roger Garside. "Comparing corpora using frequency profiling." -// Proceedings of the workshop on Comparing Corpora. 
Association for Computational Linguistics, 2000 -compareCorpora:{[corp1;corp2] - if[(not count corp1)|(not count corp2);:((`$())!();(`$())!())]; - getTermCount:{[corp] - i.fastSum{1+log count each group x}each corp[`tokens]@'where each not corp`isStop}; - totalWordCountA:sum termCountA:getTermCount corp1; - totalWordCountB:sum termCountB:getTermCount corp2; - // The expected termCount of each term in each corpus - coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); - expectedA:totalWordCountA*coef; - expectedB:totalWordCountB*coef; - // Return the differences between the corpora - (desc termCountA*log termCountA%expectedA;desc termCountB*log termCountB%expectedB)} - -// Calc cosine similarity of two docs -compareDocs:{cosineSimilarity .(x;y)@\:distinct raze key each(x;y)} - -// Compare similarity of 2 vectors -cosineSimilarity:{sum[x*y]%(sqrt sum x*x)*sqrt sum y*y} - -// How much each term contributes to the cosine similarity -explainSimilarity:{[doc1;doc2] - alignedKeys:inter[key doc1;key doc2]; - doc1@:alignedKeys; - doc2@:alignedKeys; - product:(doc1%i.magnitude doc1)*(doc2%i.magnitude doc2); - desc alignedKeys!product%sum product} - -// Find runs containing term where each word has above average co-ocurrance with term -extractPhrases:{[corpus;term] - relevant:term,sublist[150]where 01)#r:count each group r where term in/:r:raze tokens@'runs} - -// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) -findDates:tm.findDates - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -findTimes:tm.findTimes - -// Get all sentences for a doc -getSentences:i.getSentences - -// Find runs of tokens whose POS tags are in the set passed in -// Returns pair (text; firstIndex) -findPOSRuns:{[tagType;tags;doc] - start:where 1=deltas matchingTag:doc[tagType]in tags; - ii:start+til each lengths:sum each start cut matchingTag; - runs:`$" "sv/:string each doc[`tokens]start+til each lengths; - flip(runs;ii)} - -// Generate feature vector (of stemmed tokens) for a term -findRelatedTerms:{[docs;term] - sent:raze docs[`sentIndices]cut'@'[docs[`tokens];where each docs`isStop;:;`]; - sent@:asc distinct raze 0|-1 0 1+\:where(term:lower term)in/:sent; - ccur:` _ count each group raze distinct each sent; - tcur:idx@'group each docs[`tokens]@'idx:where each docs[`tokens]in\:key ccur; - tcur:i.fastSum((count distinct@)each)each docs[`sentIndices]bin'tcur; - ccur%:tcur term; - tcur%:sum count each docs`sentIndices; - desc except[where r>0;term]#r:(ccur-tcur)%sqrt tcur*1-tcur} - -// Jaro-Winkler distance between 2 strings -jaroWinkler:{i.jaroWinkler[lower x;lower y]} - -// Import all files in a dir recursively -loadTextFromDir:{[fp] - path:{[fp]raze$[-11=type k:key fp:hsym fp;fp;.z.s each` sv'fp,'k]}`$fp; - ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path)} - -// Read an mbox file, converting it to a table with the parsed metadata -loadEmails:email.i.getMboxText - -// Create a new parser using a spaCy model (must already be installed) -newParser:parser.i.newParser - -// Detect language from text -detectLang:{[text]`$.p.import[`langid][`:classify;<][raze text]0} - -// Parse urls to dictionaries -parseURLs:{`scheme`domainName`path`parameters`query`fragment!i.parseURLs x} - -// Calculate sentiment of sentence of short message -sentiment:sent.score - -// Phonological representation of string (commented out for now) -/doubleMetaphone:.p.import[`metaphone;`:doublemetaphone;<] +path:{string`nlp^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get 
.z.s)0};`;""]}` +loadfile:{$[.z.q;;-1]"Loading ",x:_[":"=x 0]x:$[10=type x;;string]x;system"l ",path,"/",x;} diff --git a/requirements.txt b/requirements.txt index c3843f2..c08bc92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy beautifulsoup4 -spacy +spacy==2.2.1 +langdetect diff --git a/tests/clustertest.t b/tests/clustertest.t index 3503831..6bec5fb 100644 --- a/tests/clustertest.t +++ b/tests/clustertest.t @@ -1,7 +1,8 @@ +\l nlp.q \l init.q \d .nlp -text: first (enlist "*";",";1) 0: `:./data/miniJeff.txt -p:newParser[`en; enlist`keywords] +text: first (enlist "*";",";1) 0: `:tests/data/miniJeff.txt +p:newParser[`en;`keywords] corpus:p text emptyDoc:([] keywords:enlist ()!()) truncate:{[precision; x]coefficient: 10 xexp precision;reciprocal[coefficient]*`long$coefficient*x} @@ -78,17 +79,17 @@ orthogonalDocs:(`a`b!1 1f;`c`d!1 1f;`e`f!1 1f) .2 ~ truncate[2] cluster.MSE corpus[1 3 5 7 9; `keywords] corpus:p ("beep beep beep";"In Brittany, the Bretons play the bombard";"The Bretons of Brittany enjoy bombard music";"A special hand tool is needed to adjust a bike chain to the right length";"A chain whip is a tool used by a bike mechanic";"Chain oil is recommeneded instead of WD-40"; "A bike mechanic frequently gets chain oil on their hands"; "I enjoy medieval music";"The lute is a common medieval instrument";"Wire strings are common on medieval harps";"Lutes have too many strings";"Medieval wind instruments are also abundant";"No medieval wind instruments had strings";"The modern harp has mostly nylon strings";"Modern music is much less shrill"); centroids:sum each corpus[`keywords] (enlist 0;1 + til 2;3 + til 4;7 + til 8) -cluster.i.groupByCentroids[centroids; 1 _ corpus `keywords] ~ -1 + (1 2;3 4 5 6;7 8 9 10 11 12 13 14) -cluster.i.groupByCentroids[centroids enlist 0; 1 _ corpus `keywords]~ enlist til 14 -cluster.i.groupByCentroids[centroids 1 2; corpus `keywords]~ (0 8 9 10 11 12 13;1 2 7 14;3 4 5 6) -cluster.i.groupByCentroids[centroids; corpus `keywords]~(enlist 0;1 2;3 4 5 6;7 8 9 10 11 12 13 14) -cluster.i.groupByCentroids[centroids 0 1 2;()] ~ () -cluster.i.groupByCentroids[centroids enlist 2; corpus `keywords] ~ (0 1 2 7 8 9 10 11 12 13 14; 3 4 5 6) -cluster.i.groupByCentroids[centroids; corpus[enlist 0] `keywords]~ enlist enlist 0 -(til 15) ~ asc raze cluster.i.groupByCentroids[1_centroids;corpus`keywords] +cluster.groupByCentroids[centroids; 1 _ corpus `keywords] ~ -1 + (1 2;3 4 5 6;7 8 9 10 11 12 13 14) +cluster.groupByCentroids[centroids enlist 0; 1 _ corpus `keywords]~ enlist til 14 +cluster.groupByCentroids[centroids 1 2; corpus `keywords]~ (0 8 9 10 11 12 13;1 2 7 14;3 4 5 6) +cluster.groupByCentroids[centroids; corpus `keywords]~(enlist 0;1 2;3 4 5 6;7 8 9 10 11 12 13 14) +cluster.groupByCentroids[centroids 0 1 2;()] ~ () +cluster.groupByCentroids[centroids enlist 2; corpus `keywords] ~ (0 1 2 7 8 9 10 11 12 13 14; 3 4 5 6) +cluster.groupByCentroids[centroids; corpus[enlist 0] `keywords]~ enlist enlist 0 +(til 15) ~ asc raze cluster.groupByCentroids[1_centroids;corpus`keywords] \d . 
-text: first (enlist "*";",";1) 0: `:./data/miniJeff.txt -p:.nlp.newParser[`en; enlist`keywords] +text: first (enlist "*";",";1) 0: `:tests/data/miniJeff.txt +p:.nlp.newParser[`en;`keywords] corpus:p text emptyDoc:([] keywords:enlist ()!()) cluster:.nlp.cluster.summarize[corpus;10] diff --git a/data/message.mbox b/tests/data/message.mbox similarity index 100% rename from data/message.mbox rename to tests/data/message.mbox diff --git a/data/miniJeff.txt b/tests/data/miniJeff.txt similarity index 100% rename from data/miniJeff.txt rename to tests/data/miniJeff.txt diff --git a/data/test.mbox b/tests/data/test.mbox similarity index 100% rename from data/test.mbox rename to tests/data/test.mbox diff --git a/tests/emailtest.t b/tests/emailtest.t index f2e1ea3..31fa54f 100644 --- a/tests/emailtest.t +++ b/tests/emailtest.t @@ -1,7 +1,8 @@ +\l nlp.q \l init.q \d .nlp -lines: read0 `:./data/test.mbox; -emails:email.i.parseMail each "\n" sv/: (where lines like "From *") cut lines; +lines: read0 `:tests/data/test.mbox; +emails:email.parseMail each "\n" sv/: (where lines like "From *") cut lines; to: 9#enlist enlist("";"john.doe@domain.com"); to[0;0;0]:"John Doe"; emails[`to]~to diff --git a/tests/nlptest.t b/tests/nlptest.t index dd68b2e..778610f 100644 --- a/tests/nlptest.t +++ b/tests/nlptest.t @@ -1,9 +1,10 @@ +\l nlp.q \l init.q \d .nlp charPosParser:newParser[`en; `sentChars`starts`tokens] doc:first charPosParser enlist text:"Le café noir était pour André Benoît. Mes aïeux été vieux." all(doc[`tokens]~`$("le";"café";"noir";"était";"pour";"andré";"benoît";"mes";"aïeux";"été";"vieux");(doc[`starts] cut text)~("Le ";"café ";"noir ";"était ";"pour ";"André ";"Benoît. ";"Mes ";"aïeux ";"été ";"vieux.");(doc[`sentChars;;0] cut text)~("Le café noir était pour André Benoît. ";"Mes aïeux été vieux.");((0,doc[`sentChars;;1]) cut text)~("Le café noir était pour André Benoît.";" Mes aïeux été vieux.";"")) -text: first (enlist "*";",";1) 0: `:./data/miniJeff.txt +text: first (enlist "*";",";1) 0: `:tests/data/miniJeff.txt p:newParser[`en; `tokens`isStop]; corpus:p text; keywords:TFIDF corpus; @@ -14,7 +15,10 @@ keywords[0; `billion] > keywords[0; `transacting] enlist[(`u#`$())!()]~TFIDF([]tokens:enlist `$(); isStop:enlist `boolean$()); keywords:TFIDF enlist corpus 1; 98h~type keywords -p:newParser[`en; enlist`keywords]; +keywords_tot:TFIDF_tot corpus +keywords_tot[`erv]~keywords_tot[`published] +keywords_tot[`mpr] > keywords_tot[`attached] +p:newParser[`en;`keywords]; corpus:p text; 1f~compareDocs . 
corpus[`keywords]0 0 0f~compareDocs[(enlist`a)!enlist 1;(enlist `b)!enlist 1] @@ -27,6 +31,10 @@ truncate:{[precision; x]coefficient: 10 xexp precision;reciprocal[coefficient] * 0f~truncate[3] cosineSimilarity[0 1; 1 0] 1f~truncate[3] cosineSimilarity[0 1; 0 1] 1f~truncate[3] cosineSimilarity[1; 1] +centroid:sum corpus`keywords +1 1f~2#desc compareDocToCentroid[centroid]each corpus`keywords +1 1 1 1f~4#desc compareDocToCorpus[corpus`keywords;0] +0 0 0f~3#asc compareDocToCorpus[corpus`keywords;0] explainSimilarity[(`a`b`c)!(.1 .2 .3);(`e`f`g)!(.1 .2 .3)]~(`$())!`float$() all(explainSimilarity[(`a`b`c)!(.1 .2 .3); (`$())!(`float$())]~(`$())!(`float$());explainSimilarity[(`$())!(`float$());(`a`b`c)!(.1 .2 .3)]~(`$())!(`float$());explainSimilarity[(`$())!(`float$());(`$())!(`float$())]~(`$())!(`float$())) all(explainSimilarity[(enlist `a)!enlist .1;(enlist `a)!enlist .1]~(enlist `a)!enlist 1f;explainSimilarity[(enlist `a)!enlist .1;(enlist `a)!enlist .5]~(enlist `a)!enlist 1f;explainSimilarity[(enlist `a)!enlist .1;(enlist `b)!enlist .5]~(`$())!(`float$())) @@ -60,7 +68,7 @@ posParser:newParser[`en; `uniPOS`pennPOS`tokens] findPOSRuns[`uniPOS; `ADV`VERB;first posParser enlist". ."]~() findPOSRuns[`uniPOS; `DET;first posParser enlist "The"]~enlist(`the; enlist 0) findPOSRuns[`uniPOS; `VERB;first posParser enlist"The train from nowhere"]~() -findPOSRuns[`uniPOS; `VERB;first posParser enlist"has been gone dancing"]~enlist(`$"has been gone dancing";0 1 2 3) +findPOSRuns[`uniPOS; `VERB;first posParser enlist"has been gone dancing"]~enlist(`$"gone dancing";2 3) doc:first posParser enlist"Wade Hemsworth famously surveyed the Abitibi Waterways in North Ontario."; all(findPOSRuns[`uniPOS;`DET`PROPN;doc];findPOSRuns[`pennPOS;`DT`NNP`NNPS; doc])~\:((`$"wade hemsworth"; 0 1);(`$"the abitibi waterways"; 4 5 6);(`$"north ontario"; 8 9)) p:newParser[`en;`tokens`isStop`sentIndices]; @@ -98,35 +106,40 @@ keywords:keywordsContinuous enlist doc; 99h ~ type keywords keywords:keywordsContinuous corpus; {x~desc x} keywords `chairman`chief`group`enron`thanks`mountains -emails:.nlp.loadEmails["data/test.mbox"] +(1 1f,(2%3),(1%3),0.5 0.5 0.5 0.5 0.5 0.5)~value 10#ngram[enlist first corpus;2] +1 1 .5 .5 1 1 1 1 1 1f~value 10#ngram[enlist first corpus;3] +((`enrononline`management`report);(`management`report`june);(`report`june`attached))~key 3#ngram[enlist first corpus;3] +emails:email.loadEmails["tests/data/test.mbox"] `sender`to`date`subject`contentType`payload`text~cols emails -(last .nlp.loadEmails["data/test.mbox"]`text)~"Your email client does not support HTML mails." +(last emails`text)~"Your email client does not support HTML mails." ("multipart/alternative";"multipart/alternative";"multipart/alternative";"multipart/alternative";"multipart/alternative";"multipart/alternative";"text/html";"multipart/alternative";"multipart/alternative")~emails`contentType +`sender`to`volume~cols email.getGraph emails +1~(last email.getGraph emails)`volume parseURLs["http://www.google.com"]~`scheme`domainName`path`parameters`query`fragment!("http";"www.google.com";"";"";"";"") parseURLs["ssh://samsquanch@mx4.hotmail.com"][`scheme`domainName]~("ssh";"samsquanch@mx4.hotmail.com") parseURLs["https://www.google.ca:1234/test/index.html;myParam?foo=bar&quux=blort#abc=123&def=456"]~(!) . 
flip ((`scheme;"https");(`domainName;"www.google.ca:1234");(`path;"/test/index.html");(`parameters; "myParam");(`query;"foo=bar&quux=blort");(`fragment;"abc=123&def=456")) all(parseURLs["google.ca/test/index.html"][`scheme`domainName`path]~("http";"google.ca";"/test/index.html");parseURLs["www.google.co.uk"][`scheme`domainName`path]~("http";"www.google.co.uk";"")) parseURLs["https://网站.中国.com"]~`scheme`domainName`path`parameters`query`fragment!("https";"网站.中国.com";"";"";"";"") (parseURLs each ("https://travel.gc.ca/";"https://www.canada.ca/en/revenue-agency.html"))~([]scheme:("https"; "https");domainName:("travel.gc.ca"; "www.canada.ca");path:(enlist "/";"/en/revenue-agency.html");parameters: (""; "");query:(""; "");fragment:(""; "")) -\d . -text: first (enlist "*";",";1) 0: `:./data/miniJeff.txt -p:.nlp.newParser[`en;`tokens`isStop`text] +seq:bi_gram[corpus] +seq[`enrononline`management]~1f +seq[`management`report]>seq[`report`june] +`en~detectLang["This is a sentence"] +`de~detectLang["Das ist ein Satz"] +`fr~detectLang["C'est une phrase"] +ascii["This is ä senteñcê"]~"This is sentec" +rmv_list :("http*";"*,";"*&*";"*[0-9]*") +rmv_custom["https//:google.com & https//:bing.com are 2 search engines!";rmv_list]~"are search engines!" +rmv_master["https//:google.com & https//:bing.com are 2 search engines!";",.:?!/@'\n";""]~"httpsgooglecom & httpsbingcom are 2 search engines" +loadDir:loadTextFromDir["tests/data/test.mbox"] +`fileName`path`text~cols loadDir +loadDir[`fileName]~enlist `test.mbox +text: first (enlist "*";",";1) 0: `:tests/data/miniJeff.txt +p:newParser[`en;`tokens`isStop`text] corpus:p text phonecall:corpus i:where corpus[`text] like "*Telephone Call*" -remaining:corpus til[count corpus]except i -(`message`murdock`erica`error`jerry;`enron`know`let,`meeting`company)~key each 5#/:.nlp.compareCorpora[phonecall;remaining] - - - - - - - - - - - - - - - +remaining:corpus til[count corpus]except n +(`message`murdock`erica`error`jerry;`enron`know`let,`meeting`company)~key each 5#/:compareCorpora[phonecall;remaining] +txt:"You can call the number 123 456 7890 or email us on name@email.com in book an appoinment for January,February and March for £30.00" +findRegex[txt;`phoneNumber`emailAddress`yearmonthList`money]~`phoneNumber`emailAddress`yearmonthList`money!(enlist (" 123 456 7890";23;36);enlist("name@email.com";52;66);(("January";93;100);("February";101;109);("March";114;119);("30";125;127);("00";128;130));enlist("\302\24330.00";124;130)) +\d . diff --git a/tests/parsertest.t b/tests/parsertest.t index f669d2d..c20334e 100644 --- a/tests/parsertest.t +++ b/tests/parsertest.t @@ -1,7 +1,8 @@ +\l nlp.q \l init.q \d .nlp -basicParser:newParser[`en;enlist `tokens]; -keywordParser:newParser[`en; enlist `keywords] +basicParser:newParser[`en;`tokens]; +keywordParser:newParser[`en;`keywords] allSpacyOptionsParser:newParser[`en;`likeEmail`likeURL`likeNumber`isStop`tokens`lemmas`uniPOS`pennPOS`starts]; allQOptionsParser:newParser[`en; `keywords`sentChars`sentIndices]; textPreservingParser:newParser[`en; `tokens`text]; @@ -17,7 +18,7 @@ all(keywords[0;`lacrosse] > keywords[0;`team];keywords[0;`lacrosse] < keywords[4 docs: ("The great Québec maple syrup heist"; "Québec is great"); cols[keywordParser docs] ~ enlist `keywords result:allSpacyOptionsParser enlist"Email Jeff Bezos at jeff@amazon.com. 
He gets 65,536 emails a day from people asking about www.blueorigin.com or https://amazon.ca."; -all(cols[result] ~`likeEmail`likeURL`likeNumber`isStop`tokens`lemmas`uniPOS`pennPOS`starts;result[`likeEmail] ~enlist 000010000000000000b;result[`likeURL]~enlist 000000000000000101b;result[`likeNumber]~enlist 000000010000000000b;result[`isStop]~enlist 000101110101001010b;result[`tokens]~enlist `email`jeff`bezos`at,(`$"jeff@amazon.com"),`he`gets,(`$"65,536"),`emails`a`day`from`people`asking`about`www.blueorigin.com`or`https://amazon.ca;result[`lemmas]~ enlist `Email`Jeff`Bezos`at,(`$"jeff@amazon.com"),(`$"-PRON-"),`get,(`$"65,536"),`email`a`day`from`people`ask`about`www.blueorigin.com`or`https://amazon.ca;result[`uniPOS]~ enlist `PROPN`PROPN`PROPN`ADP`X`PRON`VERB`NUM`NOUN`DET`NOUN`ADP`NOUN`VERB`ADP`NOUN`CCONJ`NOUN;result[`pennPOS] ~ enlist `NNP`NNP`NNP`IN`ADD`PRP`VBZ`CD`NNS`DT`NN`IN`NNS`VBG`IN`NN`CC`NN;result[`starts]~enlist 0 6 11 17 20 37 40 45 52 59 61 65 70 77 84 90 109 112) +all(cols[result] ~`likeEmail`likeURL`likeNumber`isStop`tokens`lemmas`uniPOS`pennPOS`starts;result[`likeEmail] ~enlist 000010000000000000b;result[`likeURL]~enlist 000000000000000101b;result[`likeNumber]~enlist 000000010000000000b;result[`isStop]~enlist 000101110101001010b;result[`tokens]~enlist `email`jeff`bezos`at,(`$"jeff@amazon.com"),`he`gets,(`$"65,536"),`emails`a`day`from`people`asking`about`www.blueorigin.com`or`https://amazon.ca;result[`lemmas]~ enlist `email`Jeff`Bezos`at,(`$"jeff@amazon.com"),(`$"-PRON-"),`get,(`$"65,536"),`email`a`day`from`people`ask`about`www.blueorigin.com`or`https://amazon.ca;result[`uniPOS]~ enlist `NOUN`PROPN`PROPN`ADP`X`PRON`VERB`NUM`NOUN`DET`NOUN`ADP`NOUN`VERB`ADP`X`CCONJ`PROPN;result[`pennPOS] ~ enlist `NN`NNP`NNP`IN`ADD`PRP`VBZ`CD`NNS`DT`NN`IN`NNS`VBG`IN`ADD`CC`NNP;result[`starts]~enlist 0 6 11 17 20 37 40 45 52 59 61 65 70 77 84 90 109 112) result:allQOptionsParser[enlist"O, the year was 1778 how I wish I was in Sherbrooke now. A letter of marque came from the king."]; all(cols[result]~`keywords`sentChars`sentIndices;result[`keywords]~ enlist `o`year`wish`sherbrooke`letter`marque`came`king!8#0.125;result[`sentChars] ~ enlist (0 56; 57 95);result[`sentIndices] ~ enlist 0 13) result:first sentenceParser enlist" Hornpipe, jig, and reel. \nThis is a good song" diff --git a/tests/senttest.t b/tests/senttest.t index cbee02a..bb0d84d 100644 --- a/tests/senttest.t +++ b/tests/senttest.t @@ -1,23 +1,24 @@ +\l nlp.q \l init.q \d .nlp -sent.amplifyEP[""]~0f -sent.amplifyEP[enlist "!"]~.292 -0 .292 .584 .876 1.168 1.168 ~sent.amplifyEP each ("ok"; "bad!"; "no!worse!"; "terrible!!!"; "ghastly!!!! eew"; "!!!!!!!!!!") -sent.amplifyQM[""]~0f -sent.amplifyQM[enlist "?"]~0f -0 0 0.36 0.54 0.96 0.96~sent.amplifyQM each ("yes"; "oh?"; "oh? 
really?"; "you don't say???"; "forsooth????"; "????????????") -all (sent.findSequence[`a`b`c`d;enlist`c]~enlist 2;sent.findSequence[`c`b`c`d; enlist `c] ~ 0 2) -all (sent.findSequence[`a`b`c`d`e`f;`c`d]~enlist 2;sent.findSequence[`a`b`c`d`e`f`c`d; `c`d] ~ 2 6;sent.findSequence[`a`b`c`d`e`f`a`b`c`d`e`g`a`b`c`d; `a`b`c`d] ~ 0 6 12) -sent.findSequence[`a`b`c`d;`c]~enlist 2; -sent.findSequence[`$();`a`b`c]~`long$() -all(sent.findSequence[enlist`a;`a]~enlist 0;sent.findSequence[enlist`a;`b]~`long$()) -sent.findSequence[`a`b`c`d`e`a;`a]~0 5 -sent.findSequence[0 0 4 5 1 2 4 5;4 5]~2 6 -sent.findSequence["Facebook,Tim Cook";"oo"]~5 14 -sent.butCheck[`$(); `float$()] ~ `float$() -all(sent.butCheck[enlist `good; enlist 2f] ~ enlist 2f;sent.butCheck[enlist`but;enlist 0f]~enlist 0f) -all(sent.butCheck[`that`was`good`but; 0 0 1 0f] ~ 0 0 .5 0f;sent.butCheck[`that`was`good`but`it; 0 0 1 0 0f] ~ 0 0 .5 0 0f;sent.butCheck[`but`it`was`ok; 0 0 0 1f] ~ 0 0 0 1.5f;sent.butCheck[`tasty`but`it`smelled`bad; 2 0 0 -1.5 -2f] ~ 1 0 0 -2.25 -3f) -sent.butCheck[`it`was`good`and`useful`but`boring`and`gross;0 0 1 0 1.5 0 -1 0 -2]~0 0 .5 0 .75 0 -1.5 0 -3 +sent.i.amplifyEP[""]~0f +sent.i.amplifyEP[enlist "!"]~.292 +0 .292 .584 .876 1.168 1.168 ~sent.i.amplifyEP each ("ok"; "bad!"; "no!worse!"; "terrible!!!"; "ghastly!!!! eew"; "!!!!!!!!!!") +sent.i.amplifyQM[""]~0f +sent.i.amplifyQM[enlist "?"]~0f +0 0 0.36 0.54 0.96 0.96~sent.i.amplifyQM each ("yes"; "oh?"; "oh? really?"; "you don't say???"; "forsooth????"; "????????????") +all (sent.i.findSequence[`a`b`c`d;enlist`c]~enlist 2;sent.i.findSequence[`c`b`c`d; enlist `c] ~ 0 2) +all (sent.i.findSequence[`a`b`c`d`e`f;`c`d]~enlist 2;sent.i.findSequence[`a`b`c`d`e`f`c`d; `c`d] ~ 2 6;sent.i.findSequence[`a`b`c`d`e`f`a`b`c`d`e`g`a`b`c`d; `a`b`c`d] ~ 0 6 12) +sent.i.findSequence[`a`b`c`d;`c]~enlist 2; +sent.i.findSequence[`$();`a`b`c]~`long$() +all(sent.i.findSequence[enlist`a;`a]~enlist 0;sent.i.findSequence[enlist`a;`b]~`long$()) +sent.i.findSequence[`a`b`c`d`e`a;`a]~0 5 +sent.i.findSequence[0 0 4 5 1 2 4 5;4 5]~2 6 +sent.i.findSequence["Facebook,Tim Cook";"oo"]~5 14 +sent.i.butCheck[`$(); `float$()] ~ `float$() +all(sent.i.butCheck[enlist `good; enlist 2f] ~ enlist 2f;sent.i.butCheck[enlist`but;enlist 0f]~enlist 0f) +all(sent.i.butCheck[`that`was`good`but; 0 0 1 0f] ~ 0 0 .5 0f;sent.i.butCheck[`that`was`good`but`it; 0 0 1 0 0f] ~ 0 0 .5 0 0f;sent.i.butCheck[`but`it`was`ok; 0 0 0 1f] ~ 0 0 0 1.5f;sent.i.butCheck[`tasty`but`it`smelled`bad; 2 0 0 -1.5 -2f] ~ 1 0 0 -2.25 -3f) +sent.i.butCheck[`it`was`good`and`useful`but`boring`and`gross;0 0 1 0 1.5 0 -1 0 -2]~0 0 .5 0 .75 0 -1.5 0 -3 compare:{value (floor 1000* sent.score x) % 1000} all(compare[""]~0 0 0 0f;compare["\t\t\r\n\n"]~0 0 0 0f;compare["a b c 1"]~0 0 0 0f) all(compare["bad"]~-.543 0 1 0f;compare["racist"]~-.613 0 1 0f;compare["good"]~.44 1 0 0f;compare["free"] ~.51 1 0 0f;compare["those"]~0 0 0 1f;compare["123"]~0 0 0 1f) diff --git a/tests/utilstest.t b/tests/utilstest.t index 64f99d1..6e7414d 100644 --- a/tests/utilstest.t +++ b/tests/utilstest.t @@ -1,3 +1,4 @@ +\l nlp.q \l init.q \d .nlp all(i.findRuns[where 000000000b]~();i.findRuns[where 100000000b]~();i.findRuns[where 100000001b]~();i.findRuns[where 101010101b]~();i.findRuns[where 100100100b]~()) diff --git a/time.q b/time.q deleted file mode 100644 index ccc48e0..0000000 --- a/time.q +++ /dev/null @@ -1,10 +0,0 @@ -\d .nlp - -// Turns string matching time regex into a q time -tm.parseTime:{ - tm:"T"$x where vs[" ";x][0]in"1234567890:."; - ampm:regex.check[;x]each 
regex.objects`am`pm; - tm+$[ampm[0]&12=`hh$tm;-1;ampm[1]&12>`hh$tm;1;0]*12:00} - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -tm.findTimes:{time:(tm.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01}
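
As a usage note on the relocated code: the library is now loaded via nlp.q followed by .nlp.loadfile, after which the utilities added in code/nlp_code.q can be called directly. A minimal sketch, assuming the library sits under $QHOME/nlp and that spacy and langdetect are installed (argument strings adapted from the tests above):

```q
q)\l nlp/nlp.q
q).nlp.loadfile`:init.q
q).nlp.detectLang["Das ist ein Satz"]                / expected `de per tests/nlptest.t
q).nlp.parseURLs["https://www.google.ca:1234/test/index.html"]
q).nlp.findRegex["Call 123 456 7890 or email name@email.com";`phoneNumber`emailAddress]
```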