Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for llm in spacy init config #12820

Draft
wants to merge 8 commits into
base: v4
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 61 additions & 2 deletions spacy/cli/init_config.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import re
from enum import Enum
from pathlib import Path
from typing import List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

import srsly
from jinja2 import Template
from thinc.api import Config
from wasabi import Printer, diff_strings

from .. import util
from ..errors import Errors
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ..util import SimpleFrozenList, registry
from ._util import (
COMMAND,
Arg,
Expand Down Expand Up @@ -40,6 +41,8 @@ class InitValues:

lang = "en"
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
llm_task: Optional[str] = None
llm_model: Optional[str] = None
optimize = Optimizations.efficiency
gpu = False
pretraining = False
Expand All @@ -52,6 +55,8 @@ def init_config_cli(
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
llm_task: str = Opt(InitValues.llm_task, "--llm.task", help="Name of task for LLM pipeline components"),
llm_model: str = Opt(InitValues.llm_model, "--llm.model", help="Name of model for LLM pipeline components"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
Expand All @@ -77,6 +82,8 @@ def init_config_cli(
config = init_config(
lang=lang,
pipeline=pipeline,
llm_model=llm_model,
llm_task=llm_task,
optimize=optimize.value,
gpu=gpu,
pretraining=pretraining,
Expand Down Expand Up @@ -157,6 +164,8 @@ def init_config(
*,
lang: str = InitValues.lang,
pipeline: List[str] = InitValues.pipeline,
llm_model: Optional[str] = InitValues.llm_model,
llm_task: Optional[str] = InitValues.llm_task,
optimize: str = InitValues.optimize,
gpu: bool = InitValues.gpu,
pretraining: bool = InitValues.pretraining,
Expand All @@ -165,8 +174,57 @@ def init_config(
msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())

# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]

# Verify LLM arguments are consistent, if at least one `llm` component has been specified.
llm_spec: Dict[str, Dict[str, Any]] = {}
if "llm" in pipeline:
try:
import spacy_llm
except ImportError as ex:
raise ValueError(Errors.E1055) from ex

if llm_model is None:
msg.fail(
"Option `--llm.model` must be set if `llm` component is in pipeline.",
exits=1,
)
if llm_task is None:
msg.fail(
"Option `--llm.task` must be set if `llm` component is in pipeline.",
exits=1,
)

# Select registry handles for model(s) and task(s). Raise if no match found.
llm_spec = {
spec_type: {
"arg": llm_model if spec_type == "model" else llm_task,
"matched_reg_handle": None,
"reg_handles": getattr(registry, f"llm_{spec_type}s").get_all(),
}
for spec_type in ("model", "task")
}

for spec_type, spec in llm_spec.items():
valid_values = set()
user_value = spec["arg"].lower().replace(".", "-")
for reg_handle in spec["reg_handles"]:
reg_name = reg_handle.split(".")[1]
valid_values.add(reg_name)
if reg_name.lower() == user_value:
spec["matched_reg_handle"] = reg_handle
break
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are 2 versions of a model/task, is this code guaranteed to give you the latest version?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't - I just checked - it produces spacy.NER.v1 instead of spacy.NER.v2 when given "ner"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it just grabs the first occurence as of now. I left that as TBD since it wasn't clear how we want to go forward.


if not spec["matched_reg_handle"]:
arg = spec["arg"]
msg.fail(
f"Couldn't find a matching registration handle for {spec_type} '{arg}'. "
f"Valid options are: {valid_values}",
exits=1,
)

defaults = RECOMMENDATIONS["__default__"]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
variables = {
Expand All @@ -175,6 +233,7 @@ def init_config(
"optimize": optimize,
"hardware": "gpu" if gpu else "cpu",
"transformer_data": reco["transformer"],
"llm_spec": {key: llm_spec[key]["matched_reg_handle"] for key in llm_spec},
"word_vectors": reco["word_vectors"],
"has_letters": reco["has_letters"],
}
Expand Down
25 changes: 24 additions & 1 deletion spacy/cli/templates/quickstart_training.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer", "llm"] -%}
Copy link
Contributor

@adrianeboyd adrianeboyd Jul 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

llm shouldn't be in this list of components with listeners, which will cause tok2vec or transformer to be added (depending on the rest of the template, of course).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. I vaguely remember that the LLM block was not output properly if "llm" was not in listener_components. I'll check.

[paths]
train = null
dev = null
Expand Down Expand Up @@ -328,6 +328,18 @@ grad_factor = 1.0
{%- endif %}
{%- endif %}

{% if "llm" in components -%}
[components.llm]
factory = "llm"

[components.llm.model]
@llm_models = "{{ llm_spec['model'] }}"

[components.llm.task]
@llm_tasks = "{{ llm_spec['task'] }}"
{% endif -%}


{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{% if "tok2vec" in full_pipeline -%}
Expand Down Expand Up @@ -585,6 +597,17 @@ no_output_layer = false
{%- endif %}
{% endif %}

{% if "llm" in components -%}
[components.llm]
factory = "llm"

[components.llm.model]
@llm_models = "{{ llm_spec['model'] }}"

[components.llm.task]
@llm_tasks = "{{ llm_spec['task'] }}"
{% endif -%}

Comment on lines +600 to +610
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think (depending on exactly how this gets designed in the end), it should be possible to only have this block once because it does not depend on the listener setup.

{% for pipe in components %}
{% if pipe not in listener_components %}
{# Other components defined by the user: we just assume they're factories #}
Expand Down
2 changes: 2 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,8 @@ class Errors(metaclass=ErrorsWithCodes):
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
E1055 = ("To use the `llm` component, `spacy-llm` needs to be installed. `spacy-llm` was not found in your "
"environment, install it with `pip install spacy-llm`.")


# Deprecated model shortcuts, only used in errors and warnings
Expand Down
21 changes: 20 additions & 1 deletion spacy/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,6 @@ def test_parse_cli_overrides():
"pipeline",
[
["tagger", "parser", "ner"],
[],
["ner", "textcat", "sentencizer"],
["morphologizer", "spancat", "entity_linker"],
["spancat_singlelabel", "textcat_multilabel"],
Expand All @@ -651,6 +650,26 @@ def test_init_config(lang, pipeline, optimize, pretraining):
load_model_from_config(config, auto_fill=True)


@pytest.mark.parametrize("pipeline", [["llm"]])
@pytest.mark.parametrize("llm_model", ["noop"])
@pytest.mark.parametrize("llm_task", ["ner", "sentiment"])
def test_init_config_llm(pipeline, llm_model, llm_task):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this test should probably be in spacy-llm unless it's installed by default (similar to spacy-transformers, where I'm not sure this is tested at all, but where it could be tested).

Copy link
Member

@svlandeg svlandeg Jul 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO we should consider installing spacy-llm by default going forward. At some point we want to move it to the core codebase, but for now, having it installed by default + kept as separate repo has the advantages that we can still release updates more quickly with spacy-llm while not troubling users with an additional install command. The requirements are minimal.

config = init_config(
lang="en",
pipeline=pipeline,
llm_model=llm_model,
llm_task=llm_task,
optimize="accuracy",
pretraining=False,
gpu=False,
)
assert isinstance(config, Config)
assert len(config["components"]) == 1
assert "llm" in config["components"]

load_model_from_config(config, auto_fill=True)


def test_model_recommendations():
for lang, data in RECOMMENDATIONS.items():
assert RecommendationSchema(**data)
Expand Down