From 5ab42197d7e2196d877e434736023c0d277cf094 Mon Sep 17 00:00:00 2001 From: Mikhail Nefedov Date: Sat, 9 Nov 2024 15:56:04 +0100 Subject: [PATCH] update preprocessing to include LSH, add nltk sentence tokenizer code --- .../preprocessing/Text_preprocessing.ipynb | 1311 +++++++++++++++-- notebooks/preprocessing/homework.ipynb | 30 +- 2 files changed, 1241 insertions(+), 100 deletions(-) diff --git a/notebooks/preprocessing/Text_preprocessing.ipynb b/notebooks/preprocessing/Text_preprocessing.ipynb index 243ce9d..489da0b 100644 --- a/notebooks/preprocessing/Text_preprocessing.ipynb +++ b/notebooks/preprocessing/Text_preprocessing.ipynb @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -151,9 +151,59 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Nltk также позволяет обучить свой токенизатор предложений под определенный корпус. Это делается не очень просто, но вот тут есть исчерпывающий туториал - https://nlpforhackers.io/splitting-text-into-sentences/" + "Nltk также позволяет обучить свой токенизатор предложений под определенный корпус.\n", + "Вот код для примера (изначально я взял его отсюда - https://nlpforhackers.io/splitting-text-into-sentences/ , но сайт больше не работает):\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer\n", + " \n", + "trainer = PunktTrainer()\n", + "trainer.INCLUDE_ALL_COLLOCS = True\n", + "trainer.train(text)\n", + " \n", + "tokenizer = PunktSentenceTokenizer(trainer.get_params())\n", + " \n", + "# Test the tokenizer on a piece of text\n", + "sentences = \"Mr. James told me Dr. Brown is not available today. I will try tomorrow.\"\n", + " \n", + "print(tokenizer.tokenize(sentences))\n", + "# ['Mr. 
James told me Dr.', 'Brown is not available today.', 'I will try tomorrow.']\n", + " \n", + "# View the learned abbreviations\n", + "print(tokenizer._params.abbrev_types)\n", + "# set([...])\n", + " \n", + "# Here's how to debug every split decision\n", + "for decision in tokenizer.debug_decisions(sentences):\n", + " pprint(decision)\n", + " print('=' * 30)\n", + "\n", + "# adding abbreviations manually\n", + "tokenizer._params.abbrev_types.add('dr')\n", + " \n", + "print(tokenizer.tokenize(sentences))\n", + "# ['Mr. James told me Dr. Brown is not available today.', 'I will try tomorrow.']\n", + " \n", + "for decision in tokenizer.debug_decisions(sentences):\n", + " pprint(decision)\n", + " print('=' * 30)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -956,6 +1006,13 @@ "[token.text.lower() for token in list(razdel_tokenize(text))[:10]]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -1086,8 +1143,7 @@ "Недостатки стемминга достаточно очевидные: \n", "1) с супплетивными формами или редкими окончаниями слова стемминг работать не умеет \n", "2) к одной основе могут приводится разные слова \n", - "3) к разным основам могут сводиться формы одного слова \n", - "4) приставки не отбрасываются" + "3) к разным основам могут сводиться формы одного слова " ] }, { @@ -1124,13 +1180,41 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting pymystem3\n", + " Using cached pymystem3-0.2.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: requests in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pymystem3) (2.32.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests->pymystem3) (3.3.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests->pymystem3) (2.2.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests->pymystem3) (3.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests->pymystem3) (2024.2.2)\n", + "Installing collected packages: pymystem3\n", + "Successfully installed pymystem3-0.2.0\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install pymystem3" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "from pymystem3 import Mystem\n", "import os, json\n", - "mystem = Mystem()" + "mystem = Mystem(disambiguation=False)" ] }, { @@ -1216,6 +1300,13 @@ "words_analized[:10]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 171, @@ -1415,7 +1506,44 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "from pymorphy2.analyzer import Parse, MorphAnalyzer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "p = Parse(word='печь', tag='INFN,impf,tran', normal_form='печь', score=0.666666, \n", + " methods_stack=((dict, 
'печь', 2352, 0)))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "p._morph = MorphAnalyzer()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# p.inflect(required_grammemes={\"gent\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1425,32 +1553,19 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# основная функция - pymorphy.parse\n", - "words_analized = [morph.parse(token) for token in word_tokenize(text)]" + "words_analized = [morph.parse(token) for token in text.split()]" ] }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 45, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Parse(word='печь', tag=OpencorporaTag('INFN,impf,tran'), normal_form='печь', score=0.666666, methods_stack=((, 'печь', 2352, 0),)),\n", - " Parse(word='печь', tag=OpencorporaTag('NOUN,inan,femn sing,nomn'), normal_form='печь', score=0.166666, methods_stack=((, 'печь', 2131, 0),)),\n", - " Parse(word='печь', tag=OpencorporaTag('NOUN,inan,femn sing,accs'), normal_form='печь', score=0.166666, methods_stack=((, 'печь', 2131, 3),))]" - ] - }, - "execution_count": 179, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "morph.parse(\"печь\")" ] @@ -1610,11 +1725,268 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 57, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: spacy in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (3.7.5)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (8.2.5)\n", + "Requirement 
already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (1.0.5)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (2.7.2)\n", + "Requirement already satisfied: jinja2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (3.1.4)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (3.4.1)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (2.0.10)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (2.32.3)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (0.12.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (2.4.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (3.0.9)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (24.0)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (3.0.12)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (4.66.4)\n", + "Requirement already 
satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (2.0.8)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (1.26.4)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: setuptools in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy) (65.5.0)\n", + "Requirement already satisfied: language-data>=1.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy) (1.2.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.12.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.18.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.3)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.2.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) 
(2024.2.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.7)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.5)\n", + "Requirement already satisfied: rich>=10.11.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy) (13.7.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy) (1.5.4)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy) (8.1.7)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (0.19.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (7.0.4)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from jinja2->spacy) (2.1.5)\n", + "Requirement already satisfied: marisa-trie>=0.7.7 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy) (1.2.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (2.18.0)\n", + "Requirement already satisfied: wrapt in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (0.1.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Collecting en-core-web-sm==3.7.1\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from en-core-web-sm==3.7.1) (3.7.5)\n", + "Requirement already satisfied: setuptools in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.5.0)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.0)\n", + "Requirement already satisfied: jinja2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) 
(3.1.4)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.1)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.5)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.1)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.4)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.3)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.2)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.12.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.32.3)\n", + "Requirement already satisfied: language-data>=1.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n", + "Requirement already satisfied: pydantic-core==2.18.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.3)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.12.0)\n", + 
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.5)\n", + "Requirement already satisfied: rich>=10.11.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (13.7.1)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.5.4)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (7.0.4)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.19.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n", + "Requirement already satisfied: marisa-trie>=0.7.7 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.0)\n", + "Requirement already satisfied: wrapt in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + 
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n", + "Collecting de-core-news-sm==3.7.0\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from de-core-news-sm==3.7.0) (3.7.5)\n", + "Requirement already satisfied: setuptools in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (65.5.0)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.32.3)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.26.4)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.12.3)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.7.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) 
(2.4.8)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.0.12)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.0.9)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.4.1)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.0.10)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.0.10)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.0.5)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.0.8)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (24.0)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (8.2.5)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.4.1)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (4.66.4)\n", + "Requirement already satisfied: jinja2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.1.4)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.1.3)\n", + "Requirement already satisfied: language-data>=1.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.2.0)\n", + "Requirement already satisfied: pydantic-core==2.18.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.18.3)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.7.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (4.12.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2024.2.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from 
requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.2.1)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.1.5)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.7.11)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (8.1.7)\n", + "Requirement already satisfied: rich>=10.11.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (13.7.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.5.4)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.19.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (7.0.4)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.1.5)\n", + "Requirement already satisfied: 
marisa-trie>=0.7.7 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.2.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (2.18.0)\n", + "Requirement already satisfied: wrapt in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->de-core-news-sm==3.7.0) (0.1.2)\n", + "Installing collected packages: de-core-news-sm\n", + "Successfully installed de-core-news-sm-3.7.0\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('de_core_news_sm')\n", + "Collecting ru-core-news-sm==3.7.0\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.3/15.3 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting pymorphy3>=1.0.0\n", + " Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.8/53.8 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from ru-core-news-sm==3.7.0) (3.7.5)\n", + "Collecting pymorphy3-dicts-ru\n", + " Using cached pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)\n", + "Requirement already satisfied: dawg-python>=0.7.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0) (0.7.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.10)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.12.3)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (8.2.5)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.7.2)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.0.10)\n", + "Requirement already satisfied: setuptools in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (65.5.0)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.4.1)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.4.1)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (4.66.4)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.32.3)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (24.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.0.9)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.1.3)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.8)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from 
spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.0.5)\n", + "Requirement already satisfied: jinja2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.1.4)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.26.4)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.0.12)\n", + "Requirement already satisfied: language-data>=1.2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.2.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.18.3 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.18.3)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (4.12.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.3.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.2.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2024.2.2)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.1.5)\n", + "Requirement already satisfied: click>=8.0.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (8.1.7)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (13.7.1)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (7.0.4)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.19.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from 
jinja2->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.1.5)\n", + "Requirement already satisfied: marisa-trie>=0.7.7 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.18.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.0.0)\n", + "Requirement already satisfied: wrapt in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/mnefedov/.pyenv/versions/3.10.9/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.1.2)\n", + "Installing collected packages: pymorphy3-dicts-ru, pymorphy3, ru-core-news-sm\n", + "Successfully installed pymorphy3-2.0.2 pymorphy3-dicts-ru-2.4.417150.4580142 ru-core-news-sm-3.7.0\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('ru_core_news_sm')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "\u001b[?25hCollecting pymorphy3-dicts-ru\n", + " Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting dawg-python>=0.7.1\n", + " Using cached DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)\n", + "Collecting docopt-ng>=0.6\n", + " Downloading docopt_ng-0.9.0-py3-none-any.whl (16 kB)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.0.12)\n", + "Requirement already satisfied: numpy>=1.19.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.23.3)\n", + "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.9.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.0.10)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.4.2)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.8)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.4.8)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (4.66.1)\n", + "Requirement 
already satisfied: setuptools in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (63.2.0)\n", + "Requirement already satisfied: jinja2 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.11.3)\n", + "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.3.4)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.1.2)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.31.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.0.9)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (1.0.5)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (6.3.0)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.10)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (23.1)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.1.8 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (8.2.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in 
/Users/mnefedov/miniforge3/lib/python3.9/site-packages (from spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.3.0)\n", + "Requirement already satisfied: pydantic-core==2.10.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.10.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (4.8.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.6.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2023.7.22)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (3.2.0)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from thinc<8.3.0,>=8.1.8->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.1.3)\n", + "Requirement already satisfied: 
click<9.0.0,>=7.1.1 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (7.1.2)\n", + "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (0.16.0)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /Users/mnefedov/miniforge3/lib/python3.9/site-packages (from jinja2->spacy<3.8.0,>=3.7.0->ru-core-news-sm==3.7.0) (2.0.1)\n", + "Installing collected packages: pymorphy3-dicts-ru, dawg-python, docopt-ng, pymorphy3, ru-core-news-sm\n", + "Successfully installed dawg-python-0.7.2 docopt-ng-0.9.0 pymorphy3-1.2.1 pymorphy3-dicts-ru-2.4.417150.4580142 ru-core-news-sm-3.7.0\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('ru_core_news_sm')\n" + ] + } + ], "source": [ "!pip install spacy\n", "!python -m spacy download en_core_web_sm\n", @@ -1624,14 +1996,28 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "OSError", + "evalue": "[E050] Can't find model 'ru_core_news_sm'. 
It doesn't seem to be a Python package or a valid path to a data directory.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[56], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# загружаем пайплайн для английского языка\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspacy\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mru_core_news_sm\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.9/lib/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 28\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 29\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 34\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 35\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 50\u001b[0m 
\u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 54\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43menable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.9/lib/python3.10/site-packages/spacy/util.py:472\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 472\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n", + "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'ru_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory." + ] + } + ], "source": [ "# загружаем пайплайн для английского языка\n", "import spacy\n", "\n", - "nlp = spacy.load(\"en_core_web_sm\")\n" + "nlp = spacy.load(\"ru_core_news_sm\")\n" ] }, { @@ -2062,16 +2448,30 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 55, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "OSError", + "evalue": "[E050] Can't find model 'ru_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[55], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mru_core_news_sm\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.9/lib/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 28\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 29\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 34\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 
35\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 54\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43menable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.9/lib/python3.10/site-packages/spacy/util.py:472\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m 
OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 472\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n", + "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'ru_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory." + ] + } + ], "source": [ "nlp = spacy.load(\"ru_core_news_sm\")" ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -2084,58 +2484,46 @@ }, { "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [], - "source": [ - "doc = nlp(text)\n", - "\n", - "for sent in doc.sents: # достаем предложения\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 117, + "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ДАННОЕ данное NOUN\n", - "СООБЩЕНИЕ сообщение PROPN\n", + "ДАННОЕ ДАННОЕ PROPN\n", + "СООБЩЕНИЕ СООБЩЕНИЕ PROPN\n", "( ( PUNCT\n", - "МАТЕРИАЛ материал PROPN\n", + "МАТЕРИАЛ МАТЕРИАЛ PROPN\n", ") ) PUNCT\n", - "СОЗДАНО создано PROPN\n", - "И и PROPN\n", + "СОЗДАНО СОЗДАНО PROPN\n", + "И И PROPN\n", "( ( PUNCT\n", - "ИЛИ или PROPN\n", + "ИЛИ ИЛИ PROPN\n", ") ) PUNCT\n", - "РАСПРОСТРАНЕНО распространено PROPN\n", - "ИНОСТРАННЫМ иностранным PROPN\n", - "СРЕДСТВОМ средством PROPN\n", - "МАССОВОЙ массовой PROPN\n", - "ИНФОРМАЦИИ информации PROPN\n", + "РАСПРОСТРАНЕНО РАСПРОСТРАНЕНО PROPN\n", + "ИНОСТРАННЫМ ИНОСТРАННЫМ PROPN\n", + "СРЕДСТВОМ СРЕДСТВОМ PROPN\n", + "МАССОВОЙ МАССОВОЙ VERB\n", + 
"ИНФОРМАЦИИ ИНФОРМАЦИИ PROPN\n", ", , PUNCT\n", - "ВЫПОЛНЯЮЩИМ выполняющим PROPN\n", - "ФУНКЦИИ функция PROPN\n", - "ИНОСТРАННОГО иностранного PROPN\n", - "АГЕНТА агента PROPN\n", + "ВЫПОЛНЯЮЩИМ ВЫПОЛНЯЮЩИМ PROPN\n", + "ФУНКЦИИ ФУНКЦИИ NOUN\n", + "ИНОСТРАННОГО иностранного NOUN\n", + "АГЕНТА агента NOUN\n", ", , PUNCT\n", - "И и CCONJ\n", + "И И PROPN\n", "( ( PUNCT\n", - "ИЛИ или PROPN\n", + "ИЛИ ИЛИ PROPN\n", ") ) PUNCT\n", - "РОССИЙСКИМ российским PROPN\n", - "ЮРИДИЧЕСКИМ юридическим PROPN\n", - "ЛИЦОМ лицом PROPN\n", + "РОССИЙСКИМ РОССИЙСКИМ VERB\n", + "ЮРИДИЧЕСКИМ ЮРИДИЧЕСКИМ PROPN\n", + "ЛИЦОМ ЛИЦОМ PROPN\n", ", , PUNCT\n", - "ВЫПОЛНЯЮЩИМ выполняющим PROPN\n", - "ФУНКЦИИ функция PROPN\n", - "ИНОСТРАННОГО иностранного PROPN\n", - "АГЕНТА агента PROPN\n" + "ВЫПОЛНЯЮЩИМ ВЫПОЛНЯЮЩИМ PROPN\n", + "ФУНКЦИИ ФУНКЦИИ NOUN\n", + "ИНОСТРАННОГО иностранного NOUN\n", + "АГЕНТА агента NOUN\n" ] } ], @@ -2149,26 +2537,785 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 61, "metadata": {}, "outputs": [ { - "ename": "NotImplementedError", - "evalue": "[E894] The 'noun_chunks' syntax iterator is not implemented for language 'ru'.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Noun phrases:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnoun_chunks\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 
1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Noun phrases:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnoun_chunks\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/spacy/tokens/doc.pyx\u001b[0m in \u001b[0;36mnoun_chunks\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mNotImplementedError\u001b[0m: [E894] The 'noun_chunks' syntax iterator is not implemented for language 'ru'." + "name": "stdout", + "output_type": "stream", + "text": [ + "Noun phrases: ['ДАННОЕ СООБЩЕНИЕ', 'МАТЕРИАЛ', 'ИНОСТРАННЫМ СРЕДСТВОМ', 'ВЫПОЛНЯЮЩИМ ФУНКЦИИ ИНОСТРАННОГО АГЕНТА', 'И', 'ИЛИ', 'ЮРИДИЧЕСКИМ', 'ЛИЦОМ', 'ВЫПОЛНЯЮЩИМ ФУНКЦИИ ИНОСТРАННОГО АГЕНТА']\n" ] } ], "source": [ "print(\"Noun phrases:\", [chunk.text for chunk in doc.noun_chunks])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Удаление дубликатов\n", + "\n", + "В многих практических задачах требуется искать совпадающие тексты. Например, для определения плагиата, или для того, чтобы отобрать уникальные тексты для обучения. Сама по себе эта задача - тривиальная. Нужно просто сравнить тексты между собой. Даже если нужно найти не просто точно совпадающие тексты (дупликаты), а еще и похожие тексты. В этом случае можно заменить прямое сравнение какой-то метрикой (например, мерой Жаккара между множествами нграммов).\n", + "\n", + "Но и то и другое становится проблемой, когда количество текстов очень большое. 
Чтобы найти дубликаты, нужно сравнить все тексты со всеми, и это слишком много вычислений даже для самых простых методов.\n", + "\n", + "Для такой задачи стандартно применяется класс алгоритмов, которые называются Locality-Sensitive Hashing (LSH). Давайте попробуем разобраться, как это работает.\n", + "\n", + "Тут три важных компонента: шинглы, minhash и LSH (это все в целом называется LSH, но и последний шаг тоже так называется, что немного запутанно)\n", + "\n", + "1) Шинглы - это просто куски текстов/документов какой-то длины. Мы бы скорее назвали это символьными нграммами.\n", + "2) Minhash - это алгоритм, который позволяет приблизительно оценить меру (сходство) Жаккара между множествами шинглов двух документов (https://en.wikipedia.org/wiki/MinHash). Это не единственный алгоритм, еще есть, например, SimHash (https://en.wikipedia.org/wiki/SimHash)\n", + "3) LSH-шаг - это еще одна оптимизация для нахождения кандидатов в дубликаты\n", + "\n", + "Пройдемся по каждому из шагов и напишем простую реализацию на питоне" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://media.springernature.com/lw1200/springer-static/image/art%3A10.1007%2Fs10660-021-09472-1/MediaObjects/10660_2021_9472_Fig2_HTML.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сгенерировать шинглы очень просто. 
def get_shingles(text, k=5):
    """Return the set of unique character shingles of length ``k`` in ``text``.

    A window of ``k`` characters slides over the string one position at a
    time. The result is a set, so repeated shingles collapse into one entry.
    A string shorter than ``k`` yields an empty set.
    """
    window_starts = range(len(text) - k + 1)
    return {text[start:start + k] for start in window_starts}
Маппинг происходит с помощью какой-то функции, и поэтому с помощью хеширования удобно проверять наличие - достаточно рассчитать хэш объекта и посмотреть, есть ли уже такой объект в таблице, сравнивать со всеми существующими объектами не нужно\n", + "\n", + "Словари и множества в питоне работают на хеш таблицах, и поэтому проверить, есть ли какой-то элемент в множестве, гораздо быстрее, чем в списке " + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "d = {str(i) for i in range(1000000)}\n", + "l = [str(i) for i in range(1000000)]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3 µs, sys: 1 µs, total: 4 µs\n", + "Wall time: 8.11 µs\n" + ] + }, + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\"8732323\" in d" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 18.3 ms, sys: 354 µs, total: 18.7 ms\n", + "Wall time: 19.7 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\"873232312\" in l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Функций для хеширования очень много, и это своя отдельная тема. 
import hashlib
from collections import defaultdict


def hash_string(s):
    """Hash a string with MD5 and return the digest as an integer."""
    digest_hex = hashlib.md5(s.encode('utf8')).hexdigest()
    return int(digest_hex, 16)


def generate_hash_functions(k):
    """Build ``k`` different hash functions from one hashing algorithm.

    The i-th function appends the decimal index ``i`` to its input before
    calling ``hash_string``, so the same string hashes differently under
    each function.
    """
    # `i=i` freezes the current index in each lambda; without it every
    # lambda would see the final value of the loop variable.
    return [lambda x, i=i: hash_string(x + str(i)) for i in range(k)]
результаты для одной строки " + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "functions = generate_hash_functions(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "69815" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "functions[0](\"пример\")" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49600" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "functions[1](\"пример\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь самая сложная часть. Вместо хэшей нужно сгенерировать их сигнатуры, которые будут приближать меру Жаккара между множествами шинглов\n", + "\n", + "Этот алгоритм обычно объясняют вот такой схемой. \n", + "\n", + "![](https://miro.medium.com/v2/resize:fit:913/1*4HbYE4-7DRfri7cfN0tN0A.png)\n", + "\n", + "В центре матрица шинглы на документы (input matrix). Такие же матрицы мы будем еще много раз строить для представления текстов мешком слов. Это разреженная матрица, где 1 показывает, что слово есть в документе, а 0 - что нет. Каждый документ представляется вектором, длина которого равна размеру всего словаря (то есть количеству уникальных слов/токенов/шинглов во всех документах). \n", + "\n", + "Такие вектора можно использовать для расчета метрики Жаккара (или для косинусной близости), но это все еще слишком дорого, потому что нужно сравнивать все со всем. Метрика Жаккара между двумя такими векторами равна количеству позиций, где в обоих векторах стоит 1, деленному на количество позиций, где 1 стоит хотя бы в одном из векторов. 
Если в конкретной позиции у обоих векторов стоят 1, значит слово/шингл есть в обоих документах. А если 1 стоит только в одном векторе, а в другом 0 - то значит пересечения между ними нет. Нули тут никак не учитываются. Это то же самое, что рассчитать пересечение множеств и поделить на объединение, только в векторном формате.\n", + "\n", + "MinHash это такое преобразование этой матрицы шинглы на документы, которое приближает меру Жаккара. Преобразование заключается в том, что генерируются перестановки строк этой матрицы, и в каждой перестановке для каждого документа находится первая ненулевая позиция, и ее индекс записывается в новую матрицу (сигнатуру). Перестановки тут обозначены цветными векторами. Если вы посмотрите на голубой вектор, то первой теперь является строчка, которая была 6-ой в изначальной матрице. В ней у первого документа уже стоит 1, поэтому для первого документа значение сигнатуры - 1 (матрица справа). Для второго документа стоит 0, поэтому нужно смотреть следующую строчку - 4-ую в изначальной матрице. В ней уже стоит 1 во втором документе, поэтому значение сигнатуры будет 2. У третьего документа значение 1, потому что в первой строчке уже стоит 1, а у четвертого документа такая же ситуация, как и у второго. (Проверьте, что вы понимаете, как получились желтая и коричневая сигнатуры)\n", + "\n", + "Теперь сигнатуры документов можно использовать для расчета Жаккара так же, как и обычные вектора. Если позиции сходятся, то документы схожи. Это гораздо эффективнее, потому что сигнатуры гораздо меньше изначальных векторов и большую часть вычислений делать теперь не нужно.\n", + "\n", + "Каждая сигнатура может не точно передавать общую близость изначальных векторов, но если сделать много таких перестановок, то схожесть по сигнатурам будет очень близка схожести по изначальным векторам.\n", + "Такой подход приближает метрику Жаккара, потому что если в обоих документах много совпадений шинглов (1 и 1 в обоих на одной позиции), то они часто будут генерировать один и тот же индекс для сигнатуры. 
Чем больше несовпадений шинглов в документах (0 в одном и 1 в другом), тем чаще в сигнатуре будут разные индексы и соответственно метрика будет низкая. Получается даже так, что вероятность того, что при случайной перестановке первая ненулевая строка содержит 1 в обоих векторах, равна метрике Жаккара между изначальными документами! (количество позиций, где оба значения 1, поделить на количество позиций, где 1 стоит хотя бы в одном из векторов). Это только вероятность, и каждая отдельная перестановка может давать другой результат, но повторяя их много раз, в среднем мы получим исходную меру близости. \n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В коде это выглядит немного по-другому, но принцип точно такой же! Просто перестановки создавать сложно и делать полную матрицу тоже не нужно. Можно просто вместо перестановок делать разные хеш функции (хеш функция это просто маппинг в число, то есть в итоге мы получаем позицию). Если применить хеш функцию ко всем шинглам в документе и взять минимальное значение, то это то же самое, что сделать перестановку и взять первое ненулевое значение вектора выше!\n", + "\n", + "Поэтому код очень простой." 
def compute_minhash_signature(shingles, hash_funcs):
    """Compute the MinHash signature of a collection of shingles.

    Each hash function is applied to every shingle and the smallest value
    is kept. The signature holds one minimum per hash function, in the
    order of ``hash_funcs``.

    Raises ``ValueError`` (from ``min``) when ``shingles`` is empty and
    ``hash_funcs`` is not.
    """
    return [min(map(hash_func, shingles)) for hash_func in hash_funcs]
def lsh(signatures, bands):
    """Group signature indices into buckets by matching signature bands.

    Every signature is cut into ``bands`` consecutive slices of equal
    length (``len(signature) // bands``; trailing values that do not fill
    a whole band are ignored). Each slice becomes a bucket key, and the
    index of the signature is appended to that bucket. Signatures that
    share at least one identical slice end up in a common bucket and are
    candidates for a full similarity check.

    Args:
        signatures: list of equal-length MinHash signatures.
        bands: number of slices to cut every signature into.

    Returns:
        ``defaultdict(list)`` mapping a band (tuple of hash values) to the
        list of indices of the signatures that contain it. Empty when
        ``signatures`` is empty.

    Raises:
        ValueError: if ``bands`` is not positive or exceeds the signature
            length (a band would be empty).
    """
    buckets = defaultdict(list)
    if not signatures:
        return buckets
    if bands < 1:
        raise ValueError("bands must be a positive integer")

    band_length = len(signatures[0]) // bands
    if band_length == 0:
        raise ValueError("bands must not exceed the signature length")

    for idx, sig in tqdm(enumerate(signatures), total=len(signatures)):
        # Iterate over band numbers and convert each to an offset. The
        # previous range(0, bands, band_length) stepped over the band count
        # and therefore visited only the first few bands.
        for band_idx in range(bands):
            start = band_idx * band_length
            band = tuple(sig[start:start + band_length])
            # The key is the band content only, as before, so the returned
            # mapping keeps its shape for existing callers.
            buckets[band].append(idx)
    return buckets
def find_similar_strings(strings_list, k=5, num_hashes=100, bands=20):
    """Find candidate pairs of similar strings using MinHash and LSH.

    Pipeline: every string is split into character shingles of length
    ``k``, a MinHash signature of ``num_hashes`` values is computed from
    the shingles, and ``lsh`` groups the signatures by identical bands.
    Every pair of strings that shares a bucket is reported.

    Strings that produce no shingles (shorter than ``k``) cannot be
    compared this way; they are skipped and never appear in the result.

    Args:
        strings_list: sequence of texts to compare.
        k: shingle length passed to ``get_shingles``.
        num_hashes: number of hash functions, i.e. the signature length.
        bands: number of bands passed to ``lsh``.

    Returns:
        Set of ``(i, j)`` tuples with ``i < j``, where ``i`` and ``j`` are
        positions in ``strings_list``. These are candidates only; the real
        similarity still has to be verified by the caller.
    """
    from itertools import combinations

    hash_funcs = generate_hash_functions(num_hashes)
    signatures = []
    # positions[n] is the index in strings_list of the string behind
    # signatures[n]; the two differ once a too-short string is skipped.
    positions = []

    # each text is processed independently: shingles -> signature
    for position, string in enumerate(tqdm(strings_list)):
        shingles = get_shingles(string, k)
        if not shingles:
            # min() over an empty shingle set would raise ValueError
            continue
        signatures.append(compute_minhash_signature(shingles, hash_funcs))
        positions.append(position)

    candidates = set()
    if not signatures:
        return candidates

    # candidates are computed from matching signature bands
    buckets = lsh(signatures, bands)
    for bucket in buckets.values():
        # sorted + de-duplicated members make combinations() emit exactly
        # the pairs with i < j
        members = sorted({positions[n] for n in bucket})
        candidates.update(combinations(members, 2))

    return candidates