Skip to content

Commit

Permalink
related to #33 and #34
Browse files Browse the repository at this point in the history
  • Loading branch information
arademaker committed Feb 6, 2023
1 parent 45cab32 commit 715a40d
Show file tree
Hide file tree
Showing 4 changed files with 851 additions and 1 deletion.
2 changes: 1 addition & 1 deletion data/ann/annotation-bi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@
{"_id":"noun.animal-2399791","doc_id":"noun.animal","type":"n","sent_id":2399791,"text":"the third compartment of the stomach of a ruminant","tokens":[{"kind":["def"],"action":"open"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"third","kind":["wf"],"lemmas":["third%1","third%3","third%4"],"tag":"un","pos":"JJ"},{"form":"compartment","kind":["wf"],"lemmas":["compartment%1"],"senses":["compartment%1:06:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"stomach","kind":["wf"],"lemmas":["stomach%1","stomach%2"],"senses":["stomach%1:08:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"ruminant","kind":["wf"],"lemmas":["ruminant%1","ruminant%3"],"senses":["ruminant%1:05:00::"],"tag":"man","pos":"NN","sep":""},{"kind":["def"],"action":"close"}],"keys":["third_stomach%1:05:00::","omasum%1:05:00::","psalterium%1:05:00::"],"ofs":"02399791","terms":["third stomach","omasum","psalterium"]}
{"_id":"noun.animal-2398521","doc_id":"noun.animal","type":"n","sent_id":2398521,"text":"massive thick-skinned herbivorous animal living in or around rivers of tropical Africa","tokens":[{"kind":["def"],"action":"open"},{"form":"massive","kind":["wf"],"lemmas":["massive%3"],"senses":["massive%5:00:00:large:00"],"tag":"man","pos":"JJ"},{"form":"thick-skinned","kind":["wf"],"lemmas":["thick-skinned%3"],"senses":["thick-skinned%5:00:00:insensitive:02"],"tag":"auto","pos":"JJ"},{"form":"herbivorous","kind":["wf"],"lemmas":["herbivorous%3"],"senses":["herbivorous%3:00:00::"],"tag":"auto","pos":"JJ"},{"form":"animal","kind":["wf"],"lemmas":["animal%1","animal%3"],"senses":["animal%1:03:00::"],"tag":"man","pos":"NN"},{"form":"living","kind":["wf"],"lemmas":["living%1","live%2","living%3"],"tag":"un","pos":"VBG"},{"form":"in","kind":["wf"],"lemmas":["in"],"tag":"ignore","pos":"IN"},{"form":"or","kind":["wf"],"lemmas":["or"],"tag":"ignore","pos":"CC"},{"form":"around","kind":["wf"],"lemmas":["around%4"],"tag":"un","pos":"IN"},{"form":"rivers","kind":["wf"],"lemmas":["river%1"],"senses":["river%1:17:00::"],"tag":"auto","pos":"NNS"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"tropical","kind":["wf"],"lemmas":["tropical%3"],"tag":"un","pos":"JJ"},{"form":"Africa","kind":["wf"],"lemmas":["Africa%1"],"senses":["africa%1:17:00::"],"tag":"auto","pos":"NNP","sep":""},{"kind":["def"],"action":"close"}],"keys":["hippopotamus_amphibius%1:05:00::","river_horse%1:05:00::","hippo%1:05:00::","hippopotamus%1:05:00::"],"ofs":"02398521","terms":["Hippopotamus amphibius","river horse","hippo","hippopotamus"]}
{"_id":"noun.animal-2399648","doc_id":"noun.animal","type":"n","sent_id":2399648,"text":"the second compartment of the stomach of a ruminant","tokens":[{"kind":["def"],"action":"open"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"second","kind":["wf"],"lemmas":["second%1","second%2","second%3","second%4"],"tag":"un","pos":"JJ"},{"form":"compartment","kind":["wf"],"lemmas":["compartment%1"],"senses":["compartment%1:06:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"stomach","kind":["wf"],"lemmas":["stomach%1","stomach%2"],"senses":["stomach%1:08:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"ruminant","kind":["wf"],"lemmas":["ruminant%1","ruminant%3"],"senses":["ruminant%1:05:00::"],"tag":"man","pos":"NN","sep":""},{"kind":["def"],"action":"close"}],"keys":["second_stomach%1:05:00::","reticulum%1:05:00::"],"ofs":"02399648","terms":["second stomach","reticulum"]}
{"_id":"noun.animal-2400139","doc_id":"noun.animal","type":"n","sent_id":2400139,"text":"true antelopes; cattle; oxen; sheep; goats","tokens":[{"kind":["def"],"action":"open"},{"form":"true","kind":["wf"],"lemmas":["true%1","true%2","true%3","true%4"],"senses":["true%5:00:00:typical:00"],"tag":"man","pos":"JJ"},{"form":"antelopes","kind":["wf"],"lemmas":["antelope%1"],"senses":["antelope%1:05:00::"],"tag":"auto","pos":"NN","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"cattle","kind":["wf"],"lemmas":["cattle%1"],"senses":["cattle%1:05:00::"],"tag":"auto","pos":"NNS","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"oxen","kind":["wf"],"lemmas":["ox%1","oxen%1"],"senses":["oxen%1:05:00::"],"tag":"man","pos":"NNS","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"sheep","kind":["wf"],"lemmas":["sheep%1"],"tag":"un","pos":"NNS","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"goats","kind":["wf"],"lemmas":["goat%1"],"senses":["goat%1:05:00::"],"tag":"man","pos":"NNS","sep":""},{"kind":["def"],"action":"close"}],"keys":["family_bovidae%1:05:00::","bovidae%1:05:00::"],"ofs":"02400139","terms":["family Bovidae","Bovidae"]}
{"_id":"noun.animal-2400139","doc_id":"noun.animal","type":"n","sent_id":2400139,"text":"true antelopes, cattle, oxen, sheep, goats","tokens":[{"kind":["def"],"action":"open"},{"form":"true","kind":["wf"],"lemmas":["true%1","true%2","true%3","true%4"],"senses":["true%5:00:00:typical:00"],"tag":"man","pos":"JJ"},{"form":"antelopes","kind":["wf"],"lemmas":["antelope%1"],"senses":["antelope%1:05:00::"],"tag":"auto","pos":"NN","sep":""},{"form":",","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"cattle","kind":["wf"],"lemmas":["cattle%1"],"senses":["cattle%1:05:00::"],"tag":"auto","pos":"NNS","sep":""},{"form":",","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"oxen","kind":["wf"],"lemmas":["ox%1","oxen%1"],"senses":["oxen%1:05:00::"],"tag":"man","pos":"NNS","sep":""},{"form":",","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"sheep","kind":["wf"],"lemmas":["sheep%1"],"tag":"un","pos":"NNS","sep":""},{"form":",","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"goats","kind":["wf"],"lemmas":["goat%1"],"senses":["goat%1:05:00::"],"tag":"man","pos":"NNS","sep":""},{"kind":["def"],"action":"close"}],"keys":["family_bovidae%1:05:00::","bovidae%1:05:00::"],"ofs":"02400139","terms":["family Bovidae","Bovidae"]}
{"_id":"noun.animal-2396427","doc_id":"noun.animal","type":"n","sent_id":2396427,"text":"Old World wild swine having a narrow body and prominent tusks from which most domestic swine come; introduced in United States","tokens":[{"kind":["def"],"action":"open"},{"glob":"auto","kind":["glob","a"],"lemmas":["Old_World%1"],"senses":["old_world%1:15:00::"],"tag":"auto"},{"form":"Old","kind":["cf","a"],"lemmas":["Old%3"],"tag":"un","pos":"JJ"},{"form":"World","kind":["cf","a"],"lemmas":["World"],"tag":"un","pos":"NNP"},{"form":"wild","kind":["wf"],"lemmas":["wild%1","wild%3","wild%4"],"senses":["wild%3:00:01::"],"tag":"man","pos":"JJ"},{"form":"swine","kind":["wf"],"lemmas":["swine%1"],"senses":["swine%1:05:00::"],"tag":"auto","pos":"NNS"},{"form":"having","kind":["wf"],"lemmas":["have%2"],"tag":"un","pos":"VBG"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"narrow","kind":["wf"],"lemmas":["narrow%1","narrow%2","narrow%3"],"tag":"un","pos":"JJ"},{"form":"body","kind":["wf"],"lemmas":["body%1","body%2"],"senses":["body%1:08:01::"],"tag":"man","pos":"NN"},{"form":"and","kind":["wf"],"lemmas":["and"],"tag":"ignore","pos":"CC"},{"form":"prominent","kind":["wf"],"lemmas":["prominent%3"],"senses":["prominent%5:00:00:conspicuous:00"],"tag":"man","pos":"JJ"},{"form":"tusks","kind":["wf"],"lemmas":["tusk%1","tusk%2"],"senses":["tusk%1:05:00::"],"tag":"man","pos":"NNS"},{"form":"from","kind":["wf"],"lemmas":["from"],"tag":"ignore","pos":"IN"},{"form":"which","kind":["wf"],"lemmas":["which"],"tag":"ignore","pos":"WDT"},{"form":"most","kind":["wf"],"lemmas":["many%3","most%3","much%3","most%4","much%4"],"tag":"un","pos":"DT"},{"form":"domestic","kind":["wf"],"lemmas":["domestic%1","domestic%3"],"senses":["domestic%5:00:00:tame:01"],"tag":"man","pos":"JJ"},{"form":"swine","kind":["wf"],"lemmas":["swine%1"],"senses":["swine%1:05:00::"],"tag":"auto","pos":"NNS"},{"form":"come","kind":["wf"],"lemmas":["come%1","come%2"],"tag":"un","pos":"VBP","sep":""},{"for
m":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"introduced","kind":["wf"],"lemmas":["introduce%2"],"senses":["introduce%2:36:01::"],"tag":"man","pos":"VBN"},{"form":"in","kind":["wf"],"lemmas":["in"],"tag":"ignore","pos":"IN"},{"glob":"auto","kind":["glob","b"],"lemmas":["United_States%1"],"tag":"un"},{"form":"United","kind":["cf","b"],"lemmas":["United"],"tag":"un","pos":"NNP"},{"form":"States","kind":["cf","b"],"lemmas":["State%1"],"tag":"un","pos":"NNP","sep":""},{"kind":["def"],"action":"close"}],"keys":["sus_scrofa%1:05:01::","boar%1:05:02::","wild_boar%1:05:00::"],"ofs":"02396427","terms":["Sus scrofa","boar","wild boar"]}
{"_id":"noun.animal-2396157","doc_id":"noun.animal","type":"n","sent_id":2396157,"text":"a mongrel hog with a thin body and long legs and a ridged back; a wild or semi-wild descendant of improved breeds; found chiefly in the southeastern United States","tokens":[{"kind":["def"],"action":"open"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"mongrel","kind":["wf"],"lemmas":["mongrel%1"],"senses":["mongrel%1:06:00::"],"tag":"man","pos":"NN"},{"form":"hog","kind":["wf"],"lemmas":["hog%1","hog%2"],"senses":["hog%1:05:00::"],"tag":"man","pos":"NN"},{"form":"with","kind":["wf"],"lemmas":["with"],"tag":"ignore","pos":"IN"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"thin","kind":["wf"],"lemmas":["thin%2","thin%3","thin%4"],"tag":"un","pos":"JJ"},{"form":"body","kind":["wf"],"lemmas":["body%1","body%2"],"senses":["body%1:08:01::"],"tag":"man","pos":"NN"},{"form":"and","kind":["wf"],"lemmas":["and"],"tag":"ignore","pos":"CC"},{"form":"long","kind":["wf"],"lemmas":["long%2","long%3","long%4"],"senses":["long%3:00:01::"],"tag":"man","pos":"JJ"},{"form":"legs","kind":["wf"],"lemmas":["leg%1","legs%1"],"senses":["leg%1:08:02::"],"tag":"man","pos":"NNS"},{"form":"and","kind":["wf"],"lemmas":["and"],"tag":"ignore","pos":"CC"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"ridged","kind":["wf"],"lemmas":["ridge%2","ridged%3"],"senses":["ridged%3:44:00::"],"tag":"man","pos":"JJ"},{"form":"back","kind":["wf"],"lemmas":["back%1","back%2","back%3","back%4"],"senses":["back%1:08:00::"],"tag":"man","pos":"NN","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"wild","kind":["wf"],"lemmas":["wild%1","wild%3","wild%4"],"senses":["wild%3:00:01::"],"tag":"man","pos":"JJ"},{"form":"or","kind":["wf"],"lemmas":["or"],"tag":"ignore","pos":"CC"},{"form":"semi-wild","kind":["wf"],"lemmas":["semi-wild%3"],"senses":["se
mi-wild%5:00:00:wild:01"],"tag":"auto","pos":"JJ"},{"form":"descendant","kind":["wf"],"lemmas":["descendant%1","descendant%3"],"senses":["descendant%1:18:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"improved","kind":["wf"],"lemmas":["improve%2","improved%3"],"senses":["improved%3:00:00::"],"tag":"man","pos":"JJ"},{"form":"breeds","kind":["wf"],"lemmas":["breed%1","breed%2"],"tag":"un","pos":"NNS","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"found","kind":["wf"],"lemmas":["found%1","find%2","found%2","found%3"],"tag":"un","pos":"VBN"},{"form":"chiefly","kind":["wf"],"lemmas":["chiefly%4"],"senses":["chiefly%4:02:00::"],"tag":"auto","pos":"RB"},{"form":"in","kind":["wf"],"lemmas":["in"],"tag":"ignore","pos":"IN"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"glob":"auto","kind":["glob","a"],"lemmas":["southeastern_United_States%1"],"senses":["southeastern_united_states%1:15:00::"],"tag":"auto"},{"form":"southeastern","kind":["cf","a"],"lemmas":["southeastern%3"],"tag":"un","pos":"JJ"},{"form":"United","kind":["cf","a"],"lemmas":["United"],"tag":"un","pos":"NNP"},{"form":"States","kind":["cf","a"],"lemmas":["State%1"],"tag":"un","pos":"NNP","sep":""},{"kind":["def"],"action":"close"}],"keys":["razorbacked_hog%1:05:00::","razorback_hog%1:05:00::","razorback%1:05:01::"],"ofs":"02396157","terms":["razorbacked hog","razorback hog","razorback"]}
{"_id":"noun.animal-2399942","doc_id":"noun.animal","type":"n","sent_id":2399942,"text":"the fourth compartment of the stomach of a ruminant; the one where digestion takes place","tokens":[{"kind":["def"],"action":"open"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"fourth","kind":["wf"],"lemmas":["fourth%1","fourth%3","fourth%4"],"tag":"un","pos":"JJ"},{"form":"compartment","kind":["wf"],"lemmas":["compartment%1"],"senses":["compartment%1:06:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"stomach","kind":["wf"],"lemmas":["stomach%1","stomach%2"],"senses":["stomach%1:08:00::"],"tag":"man","pos":"NN"},{"form":"of","kind":["wf"],"lemmas":["of"],"tag":"ignore","pos":"IN"},{"form":"a","kind":["wf"],"lemmas":["a"],"tag":"ignore","pos":"DT"},{"form":"ruminant","kind":["wf"],"lemmas":["ruminant%1","ruminant%3"],"senses":["ruminant%1:05:00::"],"tag":"man","pos":"NN","sep":""},{"form":";","kind":["wf"],"tag":"ignore","pos":"punc"},{"form":"the","kind":["wf"],"lemmas":["the"],"tag":"ignore","pos":"DT"},{"form":"one","kind":["wf"],"lemmas":["one"],"tag":"ignore","pos":"CD","type":"num"},{"form":"where","kind":["wf"],"lemmas":["where"],"tag":"ignore","pos":"WRB"},{"form":"digestion","kind":["wf"],"lemmas":["digestion%1"],"senses":["digestion%1:22:00::"],"tag":"man","pos":"NN"},{"glob":"man","kind":["glob","a"],"lemmas":["take_place%2"],"senses":["take_place%2:30:00::"],"tag":"man"},{"form":"takes","kind":["cf","a"],"lemmas":["take%1","take%2"],"tag":"un","pos":"VBZ"},{"form":"place","kind":["cf","a"],"lemmas":["place%1","place%2"],"tag":"un","pos":"NN","sep":""},{"kind":["def"],"action":"close"}],"keys":["fourth_stomach%1:05:00::","abomasum%1:05:00::"],"ofs":"02399942","terms":["fourth stomach","abomasum"]}
Expand Down
83 changes: 83 additions & 0 deletions scripts/issue-33.lisp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@

(ql:quickload '(:cl-ppcre :yason :edit-distance :serapeum))


;; read WordNet Files

(defun read-index (dict-folder)
  "Read the file named index.sense located in DICT-FOLDER into an EQUAL
hash table and return that table.

Each line is split on single spaces.  The first field is used as the
table key; the list of remaining fields is pushed onto the list stored
under that key, so a key that occurs on several lines accumulates one
entry per line (most recent first)."
  (let ((senses (make-hash-table :test #'equal))
        (path (make-pathname :name "index.sense" :defaults dict-folder)))
    (with-open-file (stream path)
      (do ((entry (read-line stream nil nil) (read-line stream nil nil)))
          ((null entry) senses)
        ;; &optional/&rest mirror CAR/CDR: an empty line yields a NIL key
        ;; with a NIL field list, exactly like splitting it by hand.
        (destructuring-bind (&optional key &rest fields)
            (cl-ppcre:split " " entry)
          (setf (gethash key senses)
                (cons fields (gethash key senses))))))))

(defun proc-db-line (str dict)
  "Parse one data line STR and record its gloss in the hash table DICT.

STR is expected to have the shape \"<data fields> | <gloss>\".  The data
part is split on single spaces and the table key is built as
\"<first field>-<third field>\" (presumably the synset offset and the
part-of-speech tag of a WordNet data file -- confirm against the WordNet
database format).  The gloss is stored with surrounding spaces removed.

Returns the stored gloss.  Signals an error if STR contains no \"|\"."
  ;; :limit 2 splits only at the FIRST bar, so a gloss that itself
  ;; contains \"|\" stays in one piece instead of breaking the two-element
  ;; destructuring below.
  (destructuring-bind (data gloss)
      (cl-ppcre:split "\\|" str :limit 2)
    (let* ((fields (cl-ppcre:split " " data))
           (ssi (format nil "~a-~a" (first fields) (third fields))))
      (setf (gethash ssi dict)
            (string-trim '(#\Space) gloss)))))

(defun read-wordnet (dict-folder)
  "Read the files data.noun, data.adv, data.adj and data.verb found in
DICT-FOLDER and return an EQUAL hash table filled by PROC-DB-LINE.

Only lines that start with a digit are passed to PROC-DB-LINE; every
other line is skipped."
  (let ((glosses (make-hash-table :test #'equal)))
    (dolist (suffix '("noun" "adv" "adj" "verb"))
      (let ((path (make-pathname :name "data" :type suffix
                                 :defaults dict-folder)))
        (with-open-file (stream path)
          (do ((entry (read-line stream nil nil) (read-line stream nil nil)))
              ((null entry))
            (when (cl-ppcre:scan "^[0-9]" entry)
              (proc-db-line entry glosses))))))
    glosses))


(defun read-jl-file (fn)
  "Read the file FN line by line, parse every line with YASON:PARSE and
return the parsed values as a list, in file order."
  (with-open-file (stream fn)
    (let ((records '()))
      (do ((row (read-line stream nil nil) (read-line stream nil nil)))
          ((null row) (nreverse records))
        (push (yason:parse row) records)))))


(defun text-from-tokens (obj)
  "Rebuild a string from the list stored under \"tokens\" in the hash
table OBJ.

Tokens without a \"form\" entry contribute nothing.  Every other token
contributes its form followed by its \"sep\" entry, which defaults to a
single space when the token has none."
  (with-output-to-string (out)
    (dolist (token (gethash "tokens" obj))
      (multiple-value-bind (form present-p) (gethash "form" token)
        (when present-p
          (princ form out)
          ;; The separator is emitted only for a non-NIL form.
          (when form
            (princ (gethash "sep" token " ") out)))))))


; basic validations: 1) the detokenization of the tokens matches the
; text; and 2) all keys are valid sense keys in WN30.

(defun main-0 (&optional (dict-folder #P"~/work/wn/WordNet-3.0/dict/")
                         (pattern "data/ann/annotation-??.jl"))
  "Run two basic validations over every record of the annotation files
matching PATTERN and print one report line per problem found.

  1. The text rebuilt by TEXT-FROM-TOKENS must be EQUAL to the record's
     \"text\" entry.
  2. Every element of the record's \"keys\" entry must be a key of the
     table that READ-INDEX builds from DICT-FOLDER.

DICT-FOLDER and PATTERN default to the locations this script has always
used, so calling (main-0) with no arguments reads the same files.

Both checks are applied to every record, and each report starts with the
record's \"_id\" so the offending record can be located.  Returns NIL."
  (let ((wn (read-index dict-folder)))
    (dolist (fn (directory pattern))
      (dolist (obj (read-jl-file fn))
        (let ((id (gethash "_id" obj))
              (text (gethash "text" obj))
              (rebuilt (text-from-tokens obj))
              ;; Keep only the keys with no entry in the index.
              (missing (remove-if (lambda (s) (gethash s wn))
                                  (gethash "keys" obj))))
          ;; Two independent WHENs rather than COND: a record that fails
          ;; the text check must still have its keys validated.
          (when (not (equal rebuilt text))
            (format t "~a: text does not match tokens~%  text:   ~s~%  tokens: ~s~%"
                    id text rebuilt))
          (when missing
            (format t "~a: ~a not in wn~%" id missing)))))))


;; for all cases where a gloss is repeated, are all the tokens of the
;; repetitions the same?

(defun is-same (txt tks1 tks2)
  "Compare the token lists TKS1 and TKS2 position by position and print
a report, prefixed with \"D\" and the text TXT, for every difference.

TKS1 and TKS2 are lists of hash tables.  Two tokens count as equal when
their entries, converted to alists and sorted by key, are EQUAL.  A
difference in the number of tokens is reported as well; the pairwise
comparison then covers the common prefix only.

Called for its output; always returns NIL."
  (flet ((canonical (token)
           ;; HASH-TABLE-ALIST returns a fresh list, so the destructive
           ;; SORT cannot damage the token.  The sorted list must be taken
           ;; from SORT's return value -- the variable that held the
           ;; unsorted list is not guaranteed to hold the whole result.
           ;; STRING< is used because SORT requires a strict ordering.
           (sort (alexandria:hash-table-alist token) #'string< :key #'car)))
    (let ((n1 (length tks1))
          (n2 (length tks2)))
      ;; The parallel iteration below stops at the shorter list, so a
      ;; length mismatch has to be reported explicitly.
      (unless (= n1 n2)
        (format t "D ~a~% token count differs: ~a vs ~a~%" txt n1 n2)))
    (loop for tk1 in tks1
          for tk2 in tks2
          do (let ((t1 (canonical tk1))
                   (t2 (canonical tk2)))
               (unless (equal t1 t2)
                 (format t "D ~a~% ~a~% ~a~%" txt t1 t2))))))

(defun main-1 ()
  "Walk every record of the files matching data/ann/annotation-??.jl and
check repeated texts for consistent tokens.

The tokens of the first record seen with a given \"text\" are remembered;
each later record with the same text is compared against them with
IS-SAME, which prints any differences.  Returns NIL."
  (let ((seen (make-hash-table :test #'equal)))
    (dolist (path (directory "data/ann/annotation-??.jl"))
      (dolist (record (read-jl-file path))
        (let* ((gloss (gethash "text" record))
               (tokens (gethash "tokens" record))
               (earlier (gethash gloss seen nil)))
          (cond (earlier
                 (is-same gloss tokens earlier))
                (t
                 (setf (gethash gloss seen) tokens))))))))

83 changes: 83 additions & 0 deletions scripts/issue-34.lisp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@

(ql:quickload '(:cl-ppcre :yason :edit-distance :serapeum))


;; read WordNet Files

(defun read-index (dict-folder)
  "Read the file named index.sense located in DICT-FOLDER into an EQUAL
hash table and return that table.

Each line is split on single spaces.  The first field is used as the
table key; the list of remaining fields is pushed onto the list stored
under that key, so a key that occurs on several lines accumulates one
entry per line (most recent first)."
  (let ((senses (make-hash-table :test #'equal))
        (path (make-pathname :name "index.sense" :defaults dict-folder)))
    (with-open-file (stream path)
      (do ((entry (read-line stream nil nil) (read-line stream nil nil)))
          ((null entry) senses)
        ;; &optional/&rest mirror CAR/CDR: an empty line yields a NIL key
        ;; with a NIL field list, exactly like splitting it by hand.
        (destructuring-bind (&optional key &rest fields)
            (cl-ppcre:split " " entry)
          (setf (gethash key senses)
                (cons fields (gethash key senses))))))))

(defun proc-db-line (str dict)
  "Parse one data line STR and record its gloss in the hash table DICT.

STR is expected to have the shape \"<data fields> | <gloss>\".  The data
part is split on single spaces and the table key is built as
\"<first field>-<third field>\" (presumably the synset offset and the
part-of-speech tag of a WordNet data file -- confirm against the WordNet
database format).  The gloss is stored with surrounding spaces removed.

Returns the stored gloss.  Signals an error if STR contains no \"|\"."
  ;; :limit 2 splits only at the FIRST bar, so a gloss that itself
  ;; contains \"|\" stays in one piece instead of breaking the two-element
  ;; destructuring below.
  (destructuring-bind (data gloss)
      (cl-ppcre:split "\\|" str :limit 2)
    (let* ((fields (cl-ppcre:split " " data))
           (ssi (format nil "~a-~a" (first fields) (third fields))))
      (setf (gethash ssi dict)
            (string-trim '(#\Space) gloss)))))

(defun read-wordnet (dict-folder)
  "Read the files data.noun, data.adv, data.adj and data.verb found in
DICT-FOLDER and return an EQUAL hash table filled by PROC-DB-LINE.

Only lines that start with a digit are passed to PROC-DB-LINE; every
other line is skipped."
  (let ((glosses (make-hash-table :test #'equal)))
    (dolist (suffix '("noun" "adv" "adj" "verb"))
      (let ((path (make-pathname :name "data" :type suffix
                                 :defaults dict-folder)))
        (with-open-file (stream path)
          (do ((entry (read-line stream nil nil) (read-line stream nil nil)))
              ((null entry))
            (when (cl-ppcre:scan "^[0-9]" entry)
              (proc-db-line entry glosses))))))
    glosses))


(defun read-jl-file (fn)
  "Read the file FN line by line, parse every line with YASON:PARSE and
return the parsed values as a list, in file order."
  (with-open-file (stream fn)
    (let ((records '()))
      (do ((row (read-line stream nil nil) (read-line stream nil nil)))
          ((null row) (nreverse records))
        (push (yason:parse row) records)))))


(defun text-from-tokens (obj)
  "Rebuild a string from the list stored under \"tokens\" in the hash
table OBJ.

Tokens without a \"form\" entry contribute nothing.  Every other token
contributes its form followed by its \"sep\" entry, which defaults to a
single space when the token has none."
  (with-output-to-string (out)
    (dolist (token (gethash "tokens" obj))
      (multiple-value-bind (form present-p) (gethash "form" token)
        (when present-p
          (princ form out)
          ;; The separator is emitted only for a non-NIL form.
          (when form
            (princ (gethash "sep" token " ") out)))))))


; basic validations: 1) the detokenization of the tokens matches the
; text; and 2) all keys are valid sense keys in WN30.

(defun main-0 (&optional (dict-folder #P"~/work/wn/WordNet-3.0/dict/")
                         (pattern "data/ann/annotation-??.jl"))
  "Run two basic validations over every record of the annotation files
matching PATTERN and print one report line per problem found.

  1. The text rebuilt by TEXT-FROM-TOKENS must be EQUAL to the record's
     \"text\" entry.
  2. Every element of the record's \"keys\" entry must be a key of the
     table that READ-INDEX builds from DICT-FOLDER.

DICT-FOLDER and PATTERN default to the locations this script has always
used, so calling (main-0) with no arguments reads the same files.

Both checks are applied to every record, and each report starts with the
record's \"_id\" so the offending record can be located.  Returns NIL."
  (let ((wn (read-index dict-folder)))
    (dolist (fn (directory pattern))
      (dolist (obj (read-jl-file fn))
        (let ((id (gethash "_id" obj))
              (text (gethash "text" obj))
              (rebuilt (text-from-tokens obj))
              ;; Keep only the keys with no entry in the index.
              (missing (remove-if (lambda (s) (gethash s wn))
                                  (gethash "keys" obj))))
          ;; Two independent WHENs rather than COND: a record that fails
          ;; the text check must still have its keys validated.
          (when (not (equal rebuilt text))
            (format t "~a: text does not match tokens~%  text:   ~s~%  tokens: ~s~%"
                    id text rebuilt))
          (when missing
            (format t "~a: ~a not in wn~%" id missing)))))))


;; for all cases where a gloss is repeated, are all the tokens of the
;; repetitions the same?

(defun is-same (txt tks1 tks2)
  "Compare the token lists TKS1 and TKS2 position by position and print
a report, prefixed with \"D\" and the text TXT, for every difference.

TKS1 and TKS2 are lists of hash tables.  Two tokens count as equal when
their entries, converted to alists and sorted by key, are EQUAL.  A
difference in the number of tokens is reported as well; the pairwise
comparison then covers the common prefix only.

Called for its output; always returns NIL."
  (flet ((canonical (token)
           ;; HASH-TABLE-ALIST returns a fresh list, so the destructive
           ;; SORT cannot damage the token.  The sorted list must be taken
           ;; from SORT's return value -- the variable that held the
           ;; unsorted list is not guaranteed to hold the whole result.
           ;; STRING< is used because SORT requires a strict ordering.
           (sort (alexandria:hash-table-alist token) #'string< :key #'car)))
    (let ((n1 (length tks1))
          (n2 (length tks2)))
      ;; The parallel iteration below stops at the shorter list, so a
      ;; length mismatch has to be reported explicitly.
      (unless (= n1 n2)
        (format t "D ~a~% token count differs: ~a vs ~a~%" txt n1 n2)))
    (loop for tk1 in tks1
          for tk2 in tks2
          do (let ((t1 (canonical tk1))
                   (t2 (canonical tk2)))
               (unless (equal t1 t2)
                 (format t "D ~a~% ~a~% ~a~%" txt t1 t2))))))

(defun main-1 ()
  "Walk every record of the files matching data/ann/annotation-??.jl and
check repeated texts for consistent tokens.

The tokens of the first record seen with a given \"text\" are remembered;
each later record with the same text is compared against them with
IS-SAME, which prints any differences.  Returns NIL."
  (let ((seen (make-hash-table :test #'equal)))
    (dolist (path (directory "data/ann/annotation-??.jl"))
      (dolist (record (read-jl-file path))
        (let* ((gloss (gethash "text" record))
               (tokens (gethash "tokens" record))
               (earlier (gethash gloss seen nil)))
          (cond (earlier
                 (is-same gloss tokens earlier))
                (t
                 (setf (gethash gloss seen) tokens))))))))

Loading

0 comments on commit 715a40d

Please sign in to comment.