From 77415441424a035b3416db64ee6760c3a902e440 Mon Sep 17 00:00:00 2001 From: Jonathan Sears Date: Tue, 30 Apr 2024 16:09:19 -0500 Subject: [PATCH] slides --- app/app.py | 6 +- docs/_posts/0000-01-01-intro.md | 6 +- docs/_posts/0000-01-02-overview.md | 16 +- docs/_posts/0000-01-03-next.md | 12 +- docs/_posts/0000-01-04-approach.md | 12 ++ docs/_posts/0000-01-04-conclusion.md | 6 - .../0000-01-05-dataset-and-methodology.md | 7 + docs/_posts/0000-01-06-Experiments.md | 6 + docs/_posts/0000-01-07-Experiments-hybrid.md | 0 docs/_posts/0000-01-08-Experiments-bow.md | 0 docs/assets/img/claudette.png | Bin 0 -> 3768 bytes notebooks/Experiments.ipynb | 152 +++++++++++------- 12 files changed, 134 insertions(+), 89 deletions(-) create mode 100644 docs/_posts/0000-01-04-approach.md delete mode 100644 docs/_posts/0000-01-04-conclusion.md create mode 100644 docs/_posts/0000-01-05-dataset-and-methodology.md create mode 100644 docs/_posts/0000-01-06-Experiments.md create mode 100644 docs/_posts/0000-01-07-Experiments-hybrid.md create mode 100644 docs/_posts/0000-01-08-Experiments-bow.md create mode 100644 docs/assets/img/claudette.png diff --git a/app/app.py b/app/app.py index 0fc01d9..738564f 100644 --- a/app/app.py +++ b/app/app.py @@ -119,13 +119,13 @@ def predict_bow(): preds = [] data = request.get_json(force=True) texts = data['text'] - print(texts) + # print(texts) preprocessed_text = [preprocess(text, n=2) for text in texts.split('.')] texts_joined = [' '.join(text) for text in preprocessed_text] - print(texts_joined) + # print(texts_joined) vectorized_text = vectorizer.transform(texts_joined) preds = bow_model.predict(vectorized_text) - print(preds) + # print(preds) return jsonify(prediction=preds.tolist(),text=texts.split('.')) return None diff --git a/docs/_posts/0000-01-01-intro.md b/docs/_posts/0000-01-01-intro.md index 5b0f580..5f13ff4 100644 --- a/docs/_posts/0000-01-01-intro.md +++ b/docs/_posts/0000-01-01-intro.md @@ -1,6 +1,8 @@ --- layout: slide -title: "NLP 
Project" +title: "Using Natural Language Processing to Identify Unfair Clauses in Terms and Conditions Documents" --- -Use the right arrow to begin! +**Authors:** Jonathan Sears, Nick Radwin +**Institution:** Tulane University +**Emails:** jsears1@tulane.edu, nradwin@tulane.edu diff --git a/docs/_posts/0000-01-02-overview.md b/docs/_posts/0000-01-02-overview.md index 88478db..523c3e9 100644 --- a/docs/_posts/0000-01-02-overview.md +++ b/docs/_posts/0000-01-02-overview.md @@ -1,19 +1,9 @@ --- layout: slide -title: "Equations and Tables" +title: "Introduction" --- -Here is an inline equation: $\sum_{i=1}^n i = ?$ +## Introduction -And a block one: - -$$e = mc^2$$ - - -Here is a table: - -| header 1 | header 2 | -|----------|----------| -| value 1 | value 2 | -| value 3 | value 4 | +Despite their ubiquity, terms and conditions are seldom read by users, leading to widespread ignorance about potentially exploitative or unfair clauses. Our project aims to bring these hidden clauses to light by using a sentence-level text classifier that labels clauses as either exploitative (1) or non-exploitative (0). We based these labels on categories outlined in a prior paper we will discuss shortly. diff --git a/docs/_posts/0000-01-03-next.md b/docs/_posts/0000-01-03-next.md index a10b1ad..37394a9 100644 --- a/docs/_posts/0000-01-03-next.md +++ b/docs/_posts/0000-01-03-next.md @@ -1,13 +1,9 @@ --- layout: slide -title: "Images" +title: "Related Work" --- +Our experiments are primarily based on **CLAUDETTE**, a research project conducted at Stanford in 2018. +They ultimately used an ensemble method, combining SVMs with LSTMs and CNNs, to achieve accuracy and F1-scores above 0.8. This was our target for this project. -Two ways to add an image. - -Note that the image is in the assets/img folder. 
- - - -![tulane](assets/img/tulane.png) +![claudette](assets/img/claudette.png) diff --git a/docs/_posts/0000-01-04-approach.md b/docs/_posts/0000-01-04-approach.md new file mode 100644 index 0000000..d7262fa --- /dev/null +++ b/docs/_posts/0000-01-04-approach.md @@ -0,0 +1,12 @@ +--- +layout: slide +title: "Approach" +--- + +We employed multiple machine learning approaches to address the challenge of identifying unfair clauses: +- **BERT models:** Utilized for their deep contextual representations. +- **Bag of Words (BoW):** Simplified text representation focusing on term frequencies. +- **Support Vector Machine (SVM):** Tested for its capability to establish a clear decision boundary. +- **Convolutional Neural Network (CNN):** Explored for its pattern recognition capabilities within text data. +- **Gradient Boosting Machine (GBM):** Chosen for its robustness and iterative improvement on classification tasks. +- **Hybrid BERT/BoW model:** An attempt to combine the strengths of BERT and BoW models. diff --git a/docs/_posts/0000-01-04-conclusion.md b/docs/_posts/0000-01-04-conclusion.md deleted file mode 100644 index 76f0dc2..0000000 --- a/docs/_posts/0000-01-04-conclusion.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -layout: slide -title: "Conclusions" ---- - -Hi there diff --git a/docs/_posts/0000-01-05-dataset-and-methodology.md b/docs/_posts/0000-01-05-dataset-and-methodology.md new file mode 100644 index 0000000..2f56da5 --- /dev/null +++ b/docs/_posts/0000-01-05-dataset-and-methodology.md @@ -0,0 +1,7 @@ +--- +layout: slide +title: "Dataset and Metrics" +--- +- **Dataset:** Consisted of 100 labeled terms and conditions documents, each sentence categorized as either fair or one of nine subcategories of unfair. +- **Binary Classification:** Simplified from multiple to two classes (fair and unfair) to address the dataset's imbalance (92% unfair). 
+- **Evaluation Metrics:** Precision, recall, and F1 score, with models trained on an evenly distributed sample for fairness in performance evaluation. diff --git a/docs/_posts/0000-01-06-Experiments.md b/docs/_posts/0000-01-06-Experiments.md new file mode 100644 index 0000000..d70b1e6 --- /dev/null +++ b/docs/_posts/0000-01-06-Experiments.md @@ -0,0 +1,6 @@ +--- +layout: slide +title: "Experiments" +--- +We originally experimented with the more complex BERT representation of the text. The thinking behind this was that the BERT encodings would be able to capture a better understanding of the text both semantically and contextually. We experimented with many different methods of fine tuning BERT, attempting to fine tune a single classifier layer on to of pooled +However we were unable to produce results near that of claudette, with our best variants of the fine tuned BERT model unable to crack an f1-score of .6 \ No newline at end of file diff --git a/docs/_posts/0000-01-07-Experiments-hybrid.md b/docs/_posts/0000-01-07-Experiments-hybrid.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/_posts/0000-01-08-Experiments-bow.md b/docs/_posts/0000-01-08-Experiments-bow.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/assets/img/claudette.png b/docs/assets/img/claudette.png new file mode 100644 index 0000000000000000000000000000000000000000..5d0acb7adeea3e4cc1bcf47824f459bc4e00962f GIT binary patch literal 3768 zcmZXXc{J4D|Ht2o2rX2Dktq6LtRrNt>||$*y=-sUO;Xu~P+4OJL&(yUJ!2b5Lbfrs z&lp{SZ z9k1Upoje|ASi=a%(~*%LLgVP@h@zs>CnM7i0G$Ak2LLSqP!9l&4i1d~@GB$ZEdXEu zfCK<<0N`MKy#xUIm6WEi*h&DX0f03!`Cx6W2mm%mzVFV?9`5W=lagp{?PKBLo5Ldu zHG~m2xB0S)Mj1`U`ZM(pf+u3j6Dj-JlX|L)~IhsXDd ziX|r}qfn@BPELxx!NS5qettfIKp+x{<6~n(iHXDM>EAp&s<2C`CML&61!7}jzDdipA`qQUPL!zVRx>2Y+q;H`r$$(WprriC+`L3xgPfN~ zOiIeGD~&(CB3~VK6@=fL#mtz{zuO_t&gp4myjb=1+gVog1+LPGA}T3Qo<1(CyM^3Z zIy?&PRDO_Tv`b?P9mErZtX9)WR4U0yJO9KFdE{$>c$Sn4l)&6DjTr2OzndQGN%H8E 
zPbAB#Sy3*hO30!_%WQrIvn)ZQ;ao>^;<|ITk5b6jlB1QvE;q88+7GezPyq(NaS1b;K4SjHxaH= zudEGP-@dW?Mw$*s?SHbB3oNM3@7E-G+b z49=?5+n0Bw>*^fo@Kbtc*&Y`^o6zPMYAsf)NMbu{y8vJ??clP{q&VlabfNLZ?wNLn zyV1Vb_qp+Nop3|<_6t@{Q#;mfZJFU0{39T4I`!sL(ZvC}RfCL7Blx(kO1rg_r8nUNUxyN zdu|ih=;^P%%nNdw&x+3LLLuG?wr;q2(r31w@V>z8zFpGF%op`A*JuNk>oe)H(wNJG z&7~2)7l2RT1qax-vlfjSI@>BrhUEq65h~LZ%2!cmw>t?N%)+#<&2AddOFft~vk2u+ z2w6IcsZ19=cXFHE>NR~1(Lo{JGX}&!pL`PR1AZX3&~xJSS>axP_j2}o%2sdp| zu)iERwQJRGHsO-u3Rd%l&<{(~Mf(I8SqIWrt(~9}-}Ef5punN(C3Xk-r$emlzvZ@H zXFez=XiFnMICji7H8IqfZIi??SSeYLQcqaLkPUY`?q!E^nxgG*G4tn%K7SV$#&S3LBD9(9 zDPb3T(sPRBHTkNW674fv7+}JhpkW5u5i4D+@Y@M#X(7TQJORCh2%?@ZnSyWu$<-yo`UDV7r=h5LxY* zDtLn~xMNe32{%XDax}q4j4b(TVFL}Z_!r{uJRo?y_Q%!@nUy!*#=Xbrr<1q^YAG6+eehq zBHHcLFHB@Vk*IRc)5(;JM@-WU!if~m21ZO2aDMkjM*CraSWX8%X-cA+rKz+i*OIlCy)h1puyE~)vMI_tslJY&)xo9hUOEEID z>*|b&?Zzro>3m~vYj;j}ub;NWoq~RX67-aKZ(+2tqt=Z%jRDc9RY?7nw&ucLH_}fH zUCAm9xWaQr`5cdiZ&$ptv{8v*9(Z-*i4C^WfA=w@c~$#`CD#oOU+5X7@6s;IiE|-4 zw2sV{$02en=B`|fIL~kyda94ZhaRP5?~E{mjT(N#mM<;=SHK5ojqcB7JL;>!y$9?4 z)~L*J2dymB;gM1FZLcG?1*Wy?w``AGJol9Me!7f4VVvWCaTTLRL~6AOR)%CGwNbom znv64@Yerw=v+1&OtFjZ!6GAN;=Q^l@ZEb=HS*7s_Uk@K|%(TpHcg)@N-JI`}khrIG zOTnsMZsbW-&1vDl(==xN5(A{xp#4cl8{C>@=t7ax_OsD6*vnerA^ia0(jJ8k^xqw))NEr~MDPaMrM zkF&+ODlSr4TzpzCqMqYhnZP*t^ThOPxy5i*OGouKkJuOg7TJhU{#JbJ3nBMdqXEu7 zI5+?Q5Ihy|YsVB?M)Myi=id?kWd5-`AH}g54{#pKxsI}bn3=S=Ke~dm<9{H>;~kIf z>Xv_m7xBaY$EsVZEeKvCrrVbpJjXjL`M)AU#8fV5rZ%9ecJ{g-k&PG9HE=LvgW^M4`3xm$bfBN zSp>|a#oE+MVCuENEd^D}u8NR?qOn<}h{FQ?m;B$K*fPCwIQ%5;h`~Z-tMg^Q zr@8rm)x|$=uK%wRiDu>++lb}UF(CyuOFxFne(p*zRBj5G9dHGymSgpQkBS6(P~Bd3 ziR>&l87l?N{`?X8o#QpG(3iQtMW$WQMyIz@yV*K~T zxYxTL-mtJ37o|&^?WWVrx|)Nh-?u(&bVsj7E{8cqq=t!&vR2D(!PRRm=|~I`kwH=A z3~!&WJU{zxtWsqxOO$MNRt|bcicFK?{BXBmAtin`r6kh_jEGo^9lcDx3$_3Z572tN zjesI&@jP!Q`sII%6x)K2o*s8@} z7mhJrQp!K^B@l#ao~(4xFV}-uktzj7otsg&zQQ{Q*q{-p6`QcBgIOwxmQqeiV$tO- z)H^3=dQzhjJ=N15jD%vnRxN*eTa_`yB2cFv##~(I8 MN5eq9RMjs0U(>u=