Improved quote detection and attribution #382
base: main
Changes from 7 commits: 23abe0d, e8e8971, 5c67017, 1c8a8ac, ff1ce50, e2568fe, 5c12aa9, 6170a7b
@@ -21,6 +21,51 @@
OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS: set[str] = {"aux", "auxpass", "neg"}

MIN_QUOTE_LENGTH: int = 4

QUOTATION_MARK_PAIRS = {
    # """
    # Ordinal points of the token.is_quote characters, matched up by start and end.
    # Some of these pairs are from weirdly formatted newspaper uploads, so could be some noise!
    # source:
    # switch = "\"\'"
    # start = "“‘```“‘«‹「『„‚"
    # end = "”’’’’”’»›」』”’"
    # """
    (34, 34),       # " "
    (39, 39),       # ' '
    (96, 8217),     # ` ’
    (171, 187),     # « »
    (8216, 8217),   # ‘ ’
    (8218, 8217),   # ‚ ’
    (8220, 8221),   # “ ”
    (8222, 8221),   # „ ”
    (8249, 8250),   # ‹ ›
    (12300, 12301), # 「 」
    (12302, 12303), # 『 』
    (8220, 34),     # “ "
    (8216, 34),     # ‘ "
    (96, 34),       # ` "
    (8216, 34),     # ‘ "
    (171, 34),      # « "
    (8249, 34),     # ‹ "
    (12300, 34),    # 「 "
    (12302, 34),    # 『 "
    (8222, 34),     # „ "
    (8218, 34),     # ‚ "
    (34, 8221),     # " ”
    (34, 8217),     # " ’
    (34, 10),       # " \n
    (39, 10),       # ' \n
    (96, 10),       # ` \n
    (171, 10),      # « \n
    (8216, 10),     # ‘ \n
    (8218, 10),     # ‚ \n
    (8249, 10),     # ‹ \n
}
Review comment: i don't think i understand the source/switch/start/end lines at the end. what are those saying?
Reply: "start" characters can only start quotations, "end" characters can only end quotations, "switch" characters can do either. I felt like it was worth leaving them in there for reference.
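To make the reply concrete: the table is consulted by comparing `ord()` code points of candidate open/close tokens, so "switch" characters like `"` appear on both sides of pairs. A minimal, self-contained sketch of that lookup, using only a small subset of the pairs above (the `is_valid_pair` helper is hypothetical, not part of the PR):

```python
# Hypothetical helper showing how an ordinal pair table like
# QUOTATION_MARK_PAIRS is consulted. Subset of the pairs above:
QUOTATION_MARK_PAIRS = {
    (34, 34),      # straight double quotes (a "switch" character: opens or closes)
    (8220, 8221),  # curly double quotes (dedicated start/end characters)
    (8216, 8217),  # curly single quotes
    (8220, 34),    # curly open, straight close (messy source text)
}

def is_valid_pair(opening: str, closing: str) -> bool:
    # Compare code points, not raw characters, per the PR's approach.
    return (ord(opening), ord(closing)) in QUOTATION_MARK_PAIRS

print(is_valid_pair("\u201c", "\u201d"))  # True: curly open, curly close
print(is_valid_pair("\u201d", "\u201c"))  # False: order reversed
```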
REPORTING_VERBS: dict[str, set[str]] = {
    "en": {
        "according",

@@ -196,3 +241,9 @@
)

RE_ALNUM: Pattern = re.compile(r"[^\W_]+")

# regexes for quote detection prep
ALL_QUOTES = '‹「`»」‘"„›”‚’\'』『«“'
DOUBLE_QUOTES = '‹「」»"„『”‚』›«“'
ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES)
DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES)
Review comment (on lines +247 to +248): elsewhere in this module, i've actually compiled the constant regex patterns. could that be done for these two, as well? note: i was having trouble getting that example compiled regex pattern to work -- assuming user error on my end
@@ -9,12 +9,12 @@

import collections
from operator import attrgetter
from typing import Iterable, Mapping, Optional, Pattern
from typing import Iterable, Mapping, Optional, Pattern, Literal

from cytoolz import itertoolz
from spacy.symbols import (
    AUX,
    VERB,
    PUNCT,
    agent,
    attr,
    aux,
@@ -27,9 +27,10 @@
    nsubjpass,
    obj,
    pobj,
    xcomp,
    xcomp

Review comment: this comma was here for a reason --

)
from spacy.tokens import Doc, Span, Token
import regex as re

Review reply: probably? I've had some issues using

from .. import constants, types, utils
from . import matches
@@ -209,6 +210,7 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:

    Args:
        doc
        min_quote_length - minimum distance (in tokens) between potentially paired quotation marks.

Review comment: ah, seeing that you've listed this as a function arg but it only comes in as a hard-coded constant value. feeling more strongly that we should move this into an arg!
Reply: yeah that makes sense
    Yields:
        Next direct quotation in ``doc`` as a (speaker, cue, content) triple.

@@ -217,86 +219,83 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
    Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
    Tagging of Reported Speech in Newspaper Articles".
    """
    # TODO: train a model to do this instead, maybe similar to entity recognition
    try:
        _reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
    except KeyError:
        raise ValueError(
            f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
            f"only {sorted(constants.REPORTING_VERBS.keys())}"
        )
    qtok_idxs = [tok.i for tok in doc if tok.is_quote]
    if len(qtok_idxs) % 2 != 0:
        raise ValueError(
            f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
            "given the limitations of this method, it's safest to bail out "
            "rather than guess which quotation is unclosed"
        )
    qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
    for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
        content = doc[qtok_start_idx : qtok_end_idx + 1]
    # pairs up quotation-like characters based on acceptable start/end combos
    # see constants for more info
    qtoks = [tok for tok in doc if tok.is_quote or (re.match(r"\n", tok.text))]
Review comment: why do we consider tokens with
Reply: some formatting dictates that if you start a new paragraph while quoting someone, you start the next paragraph with a quotation mark even though the original quotation mark is never closed. in that case the linebreak functions as a closing quotation mark. i actually added a test for it but it's actually not a great example -- I'll find a better one.
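This convention is why the pair table in constants includes entries whose second element is 10, the code point of `\n`: a linebreak can legally "close" a quotation that re-opens in the next paragraph. A quick check of the code points involved:

```python
# An opening mark paired with a newline (code point 10) is a valid pair
# in the PR's table, covering multi-paragraph quotation formatting.
opening, closer = '"', "\n"
print((ord(opening), ord(closer)))  # (34, 10), one of the QUOTATION_MARK_PAIRS entries
```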
    qtok_idx_pairs = [(-1, -1)]
    for n, q in enumerate(qtoks):
        if (
            not bool(q.whitespace_)
            and q.i not in [q_[1] for q_ in qtok_idx_pairs]
            and q.i > qtok_idx_pairs[-1][1]
        ):
            for q_ in qtoks[n+1:]:
                if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS:

Review comment: why do we store -- and compare against -- the
Reply: less room for error
Review comment: could you elaborate on that?
Reply: there are lots of ways something that is supposed to be a raw text quotation mark gets tokenized incorrectly, when you start dealing with encoding, decoding, pulling text from html, escape character issues, the
Review comment: got it. i think at a later point i may revisit some of this logic, under the assumption that the user has dealt with bad text encodings, etc. before attempting quote detection. but it's probably fine for now :)
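The author's point about confusable quote characters is easy to see numerically: glyphs that look nearly identical have distinct code points, which makes `ord()` comparisons explicit and easy to debug (illustrative only):

```python
# Visually similar quote glyphs have distinct code points; comparing
# ord() values surfaces the distinction that raw text can hide.
for ch in ['"', '\u201c', '\u201d', "'", '\u2018', '\u2019']:
    print(repr(ch), ord(ch))
```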
                    qtok_idx_pairs.append((q.i, q_.i))
                    break
    qtok_idx_pairs = qtok_idx_pairs[1:]

    def filter_quote_tokens(tok):
        return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs)

    for qtok_start_idx, qtok_end_idx in qtok_idx_pairs:
        content = doc[qtok_start_idx : qtok_end_idx]
        cue = None
        speaker = None
        # filter quotations by content

        if (
            # quotations should have at least a couple tokens
            # excluding the first/last quotation mark tokens
            len(content) < 4
            len(content.text.split()) < constants.MIN_QUOTE_LENGTH
            # filter out titles of books and such, if possible
            or all(
                tok.is_title
                for tok in content
                # if tok.pos in {NOUN, PROPN}
                if not (tok.is_punct or tok.is_stop)
            )
            # TODO: require closing punctuation before the quotation mark?
            # content[-2].is_punct is False
        ):
            continue
        # get window of adjacent/overlapping sentences
        window_sents = (
            sent
            for sent in doc.sents
            # these boundary cases are a subtle bit of work...
            if (
                (sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
                or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
            )
        )
        for window_sents in [windower(content, "overlap"), windower(content, "linebreaks")]:
            # get candidate cue verbs in window
            cue_cands = [
                tok
                for sent in window_sents
                for tok in sent
                if (
                    tok.pos == VERB
                    and tok.lemma_ in _reporting_verbs
                    # cue verbs must occur *outside* any quotation content
                    and not any(
                        qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs
                    )
            cue_candidates = [
                tok
                for sent in window_sents
                for tok in sent
                if tok.pos == VERB
                and tok.lemma_ in _reporting_verbs
                and not filter_quote_tokens(tok)
            ]
            cue_candidates = sorted(cue_candidates,
                key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx))
            )
            ]
            # sort candidates by proximity to quote content
            cue_cands = sorted(
                cue_cands,
                key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
            )
            for cue_cand in cue_cands:
                if cue is not None:
                    break
                for speaker_cand in cue_cand.children:
                    if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
                        cue = expand_verb(cue_cand)
                        speaker = expand_noun(speaker_cand)
            for cue_cand in cue_candidates:
                if cue is not None:
                    break
            if content and cue and speaker:
                yield DQTriple(
                    speaker=sorted(speaker, key=attrgetter("i")),
                    cue=sorted(cue, key=attrgetter("i")),
                    content=content,
                )

                speaker_cands = [
                    speaker_cand for speaker_cand in cue_cand.children
                    if speaker_cand.pos != PUNCT
                    and not filter_quote_tokens(speaker_cand)
                    and ((speaker_cand.i >= qtok_end_idx)
                    or (speaker_cand.i <= qtok_start_idx))
                ]
                for speaker_cand in speaker_cands:
                    if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
                        cue = expand_verb(cue_cand)
                        speaker = expand_noun(speaker_cand)
                        break
            if content and cue and speaker:
                yield DQTriple(
                    speaker=sorted(speaker, key=attrgetter("i")),
                    cue=sorted(cue, key=attrgetter("i")),
                    content=doc[qtok_start_idx:qtok_end_idx+1],
                )
                break
def expand_noun(tok: Token) -> list[Token]:
    """Expand a noun token to include all associated conjunct and compound nouns."""

@@ -305,15 +304,105 @@ def expand_noun(tok: Token) -> list[Token]:
        child
        for tc in tok_and_conjuncts
        for child in tc.children
        # TODO: why doesn't compound import from spacy.symbols?

Review comment: just wondering why this line was deleted? it's a comment, for me! :)
Reply: i thought it was my comment lol

        if child.dep_ == "compound"
    ]
    return tok_and_conjuncts + compounds


def expand_verb(tok: Token) -> list[Token]:
    """Expand a verb token to include all associated auxiliary and negation tokens."""
    verb_modifiers = [
        child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
    ]
    return [tok] + verb_modifiers
def windower(quote: Span, method: Literal["overlap", "linebreaks"]):
    """
    Finds the range of sentences in which to look for quote attribution.

    3 ways:
    - "overlap": any sentences that overlap with the quote span
    - "linebreaks": overlap sentences +/- one sentence, without crossing linebreaks after the quote
    - None: overlap sentences +/- one sentence

    Input:
        quote (Span) - quote to be attributed
        method (str) - how the sentence range will be determined

    Output:
        sents (list) - list of sentences
    """
    if method == "overlap":
        return [
            sent for sent in quote.doc.sents
            if (sent.start < quote.start < sent.end)
            or (sent.start < quote.end < sent.end)
        ]
    else:
        sent_indexes = [
            n for n, s in enumerate(quote.doc.sents)
            if (s.start <= quote.start <= s.end)
            or (s.start <= quote.end <= s.end)
        ]

        i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0
        j_sent = sent_indexes[-1] + 2
        sents = list(quote.doc.sents)[i_sent:j_sent]
        if method == "linebreaks":
            linebreaks = [0] + [tok.i for tok in quote.doc if re.match(r"\n", tok.text)] + [quote.doc[-1].i]
            linebreak_limits = [
                lb for lb in linebreaks
                if sents[0].start < lb <= quote.end + 1
            ]
            if linebreak_limits:
                return [s for s in sents if s.end <= max(linebreak_limits)]
        return sents
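The "overlap" branch's boundary predicate can be sketched independently of spaCy, with plain (start, end) index tuples standing in for sentence and quote spans (the `overlaps` helper is hypothetical, not part of the PR):

```python
def overlaps(sent: tuple, quote: tuple) -> bool:
    # A sentence is in the window when either quote boundary falls
    # strictly inside it, mirroring windower's "overlap" branch.
    s_start, s_end = sent
    q_start, q_end = quote
    return (s_start < q_start < s_end) or (s_start < q_end < s_end)

print(overlaps((0, 10), (5, 20)))   # True: quote starts inside the sentence
print(overlaps((12, 18), (5, 20)))  # False: sentence lies wholly inside the quote
```

Note the edge case shown in the second call: a sentence entirely contained within the quote span satisfies neither strict-inequality test, so it is excluded by this predicate.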
def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str:
    """
    Sorts out some common issues that trip up the quote detector. Works best one
    paragraph at a time -- use prep_document_for_quote_detection for the whole doc.

    - replaces consecutive apostrophes with a double quote (no idea why this happens but it does)
    - adds spaces before or after double quotes that don't have them
    - if enabled, fixes plural possessives by adding an "x", because the hanging apostrophe can trigger quote detection
    - adds a double quote to the end of paragraphs that are continuations of quotes and thus traditionally don't end with quotation marks

    Input:
        t (str) - text to be prepped, preferably one paragraph
        fix_plural_possessives (bool) - enables fix_plural_possessives

    Output:
        t (str) - text prepped for quote detection
    """
    if not t:
        return

    t = t.replace("\'\'", "\"")
    if fix_plural_possessives:
        t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t)
    while re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t):
        match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t)
        if len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[:match.start()])) % 2 != 0:
            replacer = '" '
        else:
            replacer = ' "'
        t = t[:match.start()] + replacer + t[match.end():]
    if (
        not (t[0] == "'" and t[-1] == "'")
        and t[0] in constants.ALL_QUOTES
        and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[1:])) % 2 == 0
    ):
        t += '"'
    return t.strip()
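A small demonstration of the plural-possessive workaround above, isolated from the module (stdlib `re` suffices for this pattern; the function name is just for illustration):

```python
import re

def pad_plural_possessives(t: str) -> str:
    # Append "x" after a trailing plural-possessive apostrophe so the
    # hanging apostrophe isn't mistaken for a closing single quote.
    return re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t)

print(pad_plural_possessives("the dogs' bowls"))  # the dogs'x bowls
print(pad_plural_possessives("the dog's bowl"))   # unchanged: not a plural possessive
```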
def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str:
    """
    Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection
    on all paragraphs, then reassembles with para_char.

    Input:
        t (str) - document to prep for quote detection
        para_char (str) - paragraph boundary in t

    Output:
        document prepped for quote detection
    """
    return para_char.join([prep_text_for_quote_detection(t) for t in t.split(para_char) if t])
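The driver's split/prep/rejoin shape can be sketched on its own, with a generic `prep` callable standing in for `prep_text_for_quote_detection` (names here are illustrative, not the PR's):

```python
def prep_document(t: str, para_char: str = "\n", prep=str.strip) -> str:
    # Split on the paragraph boundary, prep each non-empty paragraph,
    # and reassemble on the same boundary character. Note that empty
    # paragraphs are dropped, so consecutive boundaries collapse.
    return para_char.join(prep(p) for p in t.split(para_char) if p)

print(repr(prep_document("  first para \n\n second para ")))  # 'first para\nsecond para'
```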
Review comment: i'm not sure this is a "constant" value, seems more like a parameter with a default value that should go in the direct_quotations extraction function. what do you think?
Reply: yeah that makes sense