From a54515330d82030178cbaf5f25a926ba0eefeb66 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sun, 22 Sep 2024 11:05:01 -0700
Subject: [PATCH] Add some more TLD to the tokenization RE (some of which
 actually get country code TLD after them as well) 
 https://github.com/stanfordnlp/stanza/issues/1423

---
 stanza/models/tokenization/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py
index 89e32f143..75e510bc4 100644
--- a/stanza/models/tokenization/utils.py
+++ b/stanza/models/tokenization/utils.py
@@ -195,7 +195,7 @@ def process_sentence(sentence, mwt_dict=None):
 
 # https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 # modification: disallow " as opposed to all ^\s
-URL_RAW_RE = r"""(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s"]{2,}|www\.[a-zA-Z0-9]+\.[^\s"]{2,})|[a-zA-Z0-9]+\.com(?:\.[^\s"]{2,})?"""
+URL_RAW_RE = r"""(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s"]{2,}|www\.[a-zA-Z0-9]+\.[^\s"]{2,})|[a-zA-Z0-9]+\.(?:gov|org|edu|net|com|co)(?:\.[^\s"]{2,})?"""
 
 MASK_RE = re.compile(f"(?:{EMAIL_RAW_RE}|{URL_RAW_RE})")