Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
Feature/change source (#29)
Browse files Browse the repository at this point in the history
* modify llama index source

* tweaks for improved gen

* t com

* Add prompt configs

* cleanUP

* cleanUP

* mod

* lint
  • Loading branch information
emrgnt-cmplxty authored Sep 25, 2023
1 parent 4db0701 commit c24cc1f
Show file tree
Hide file tree
Showing 34 changed files with 67,002 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.yaml linguist-generated=true
*.tex linguist-generated=true
*.csv linguist-generated=true
*.json linguist-generated=true
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/check_prompt_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
if __name__ == "__main__":
data_config = DataConfig(
os.path.join(
get_data_config_dir(), "textbooks_are_all_you_need/main.yaml"
get_data_config_dir(),
"textbooks_are_all_you_need_basic_split/main.yaml",
)
)
prompt_generator = PromptGenerator(
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ poetry install -E <extra_name>
You can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.

```bash
poetry run python sciphi/examples/data_generation/runner.py --provider_name=openai --model_name=gpt-4 --log_level=DEBUG --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need
poetry run python sciphi/examples/basic_data_gen/runner.py --provider_name=openai --model_name=gpt-4 --log_level=DEBUG --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need_basic_split
```

### Key Command-Line Arguments

- `--provider_name`: Which provider to use for completions (default: "openai").
- `--model_name`: The name of the model to load from the provider (default: "gpt-3.5-turbo").
- `--temperature`: Temperature parameter for the provided model (default: 0.7).
- `--example_config`: Which example configuration to use (default: "textbooks_are_all_you_need").
- `--example_config`: Which example configuration to use (default: "textbooks_are_all_you_need_basic_split").
- `--override_config_path`: Used to override the example configurations with custom config.
- `--num_samples`: Number of samples to generate (default: 1_024).
- `--output_dir`: File path to override the default output file path with.
Expand All @@ -75,7 +75,7 @@ poetry run python sciphi/examples/data_generation/runner.py --provider_name=open
### Stock data configs

- `evol_instruct` - A config for replicating the EvolInstruct dataset
- `textbooks_are_all_you_need` - A config for replicating the Python textbook data from Textbooks Are All You Need [2]
- `textbooks_are_all_you_need_basic_split` - A config for replicating the Python textbook data from Textbooks Are All You Need [2]

### Example generated data
<img width="776" alt="Screenshot 2023-09-17 at 11 11 18 PM" src="https://github.com/emrgnt-cmplxty/SciPhi/assets/68796651/8f1ef11d-cd37-4fc7-a7a0-a1e0159ba4a3">
Expand Down
6 changes: 4 additions & 2 deletions sciphi/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def __init__(
self.config = yaml.safe_load(file)
self.main_path = os.path.dirname(main_config_path)
self.generator_mode: str = self.config.pop(generator_mode_field)

# Unpack the prompt config
self.prompt_templates: dict[str, int] = self.config.pop(
prompt_templates_field
Expand Down Expand Up @@ -76,7 +75,10 @@ def _load_configs(self) -> None:
entry = sub_config[key]

# Unroll dependencies (only single layers are supported now)
if key in self.prompt_template_input_dependencies:
if (
self.prompt_template_input_dependencies
and key in self.prompt_template_input_dependencies
):
entry = entry[
self.prompt_template_input_dependencies[key]
]
Expand Down
33,078 changes: 33,078 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/evol_grade_school.yaml

Large diffs are not rendered by default.

33,784 changes: 33,784 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/evol_seminar_i.yaml

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/main.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion sciphi/examples/basic_data_gen/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import argparse
import hashlib
import os
import random
import secrets
import time

from sciphi.config import DataConfig, DataGeneratorMode
from sciphi.core.utils import (
Expand All @@ -26,6 +28,8 @@
)
from sciphi.writers import JsonlDataWriter

random.seed(time.time())

OUTPUT_FILE_NAME = "{RUN_NAME}__provider_eq_{PROVIDER}__model_eq_{MODEL}__version_eq_{VERSION}{EXTRA}.jsonl"


Expand All @@ -44,10 +48,11 @@ def generate_random_hash() -> str:
prep_for_file_path(args.provider_name),
prep_for_file_path(args.model_name),
)

if not os.path.exists(output_dir):
os.makedirs(output_dir)

# TODO - Fail check if path does not exist after attempted creation

if not args.output_file_name:
output_file_name = OUTPUT_FILE_NAME.format(
**{
Expand Down
2 changes: 1 addition & 1 deletion sciphi/examples/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def parse_arguments() -> argparse.Namespace:
parser.add_argument(
"--example_config",
type=str,
default="textbooks_are_all_you_need",
default="textbooks_are_all_you_need_basic_split",
help="Which configuration to use for data generation?",
)
parser.add_argument(
Expand Down
1 change: 0 additions & 1 deletion sciphi/llm/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def get_config_for_provider(
provider_name: ProviderName,
) -> Type[LLMConfig]:
"""Get the configuration class for a given model."""
print("config_registry = ", LLMConfigManager.config_registry)
config_class = LLMConfigManager.config_registry.get(provider_name)

if not config_class:
Expand Down
8 changes: 8 additions & 0 deletions sciphi/makers/data_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ def __init__(
prompt_generator: PromptGenerator,
outer_prompt: Prompt,
dataset_name: Optional[str] = None,
dataset_filters: Optional[dict[str, str]] = None,
) -> None:
self.generator_mode = generator_mode
self.prompt_generator = prompt_generator
self.outer_prompt = outer_prompt
self.dataset_name = dataset_name
self.dataset_filters = dataset_filters

def synthetic_generator(
self, batch_size: int, num_samples: int
Expand Down Expand Up @@ -65,6 +67,12 @@ def hf_dataset_generator(

dataset = load_dataset(self.dataset_name, streaming=True)

# filter the dataset, on key and list of filter values
if self.dataset_filters:
for key, values in self.dataset_filters.items():
dataset = dataset.filter(
lambda example: example[key] in values,
)
counter = 0
for data in dataset["train"]:
inner_prompt = self.prompt_generator.generate_prompt(
Expand Down
12 changes: 8 additions & 4 deletions sciphi/prompt/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def _random_sample(vars_and_weights: dict) -> str:

def generate_prompt(self, optional_formatters=None) -> dict:
"""Return a prompt and its inputs."""

# Build the prompt formatters
formatters: dict[str, str] = optional_formatters or {}

Expand All @@ -47,14 +46,20 @@ def generate_prompt(self, optional_formatters=None) -> dict:
# It can be `None` if there are no dependencies
for prompt_input in self.prompt_inputs:
entry = self.config[prompt_input]
if prompt_input in self.prompt_dataset_dependencies:
if (
self.prompt_dataset_dependencies
and prompt_input in self.prompt_dataset_dependencies
):
# Parse dataset dependencies
entry = PromptGenerator._random_sample(entry)

self._insert_formatter(
formatters, prompt_input, optional_formatters[entry]
)
elif prompt_input in self.prompt_template_input_dependencies:
elif (
self.prompt_template_input_dependencies
and prompt_input in self.prompt_template_input_dependencies
):
# Parse single depth dependencies
dependent_on = self.prompt_template_input_dependencies[
prompt_input
Expand All @@ -69,7 +74,6 @@ def generate_prompt(self, optional_formatters=None) -> dict:
self._insert_formatter(formatters, prompt_input, entry)

prompt = PromptGenerator._random_sample(self.prompt_templates)

return {
PromptGenerator.RAW_PROMPT_TAG: prompt,
PromptGenerator.FORMATTED_PROMPT_TAG: prompt.format_map(
Expand Down

0 comments on commit c24cc1f

Please sign in to comment.