Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
Feature/change source (#29)
Browse files Browse the repository at this point in the history
* modify llama index source

* tweaks for improved gen

* t com

* Add prompt configs

* cleanUP

* cleanUP

* mod

* lint
  • Loading branch information
emrgnt-cmplxty authored Sep 25, 2023
1 parent 4db0701 commit c24cc1f
Show file tree
Hide file tree
Showing 34 changed files with 67,002 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.yaml linguist-generated=true
*.tex linguist-generated=true
*.csv linguist-generated=true
*.json linguist-generated=true
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/check_prompt_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
if __name__ == "__main__":
data_config = DataConfig(
os.path.join(
get_data_config_dir(), "textbooks_are_all_you_need/main.yaml"
get_data_config_dir(),
"textbooks_are_all_you_need_basic_split/main.yaml",
)
)
prompt_generator = PromptGenerator(
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ poetry install -E <extra_name>
You can use SciPhi for dataset generation by executing the relevant `runner.py` file with various command-line arguments.

```bash
poetry run python sciphi/examples/data_generation/runner.py --provider_name=openai --model_name=gpt-4 --log_level=DEBUG --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need
poetry run python sciphi/examples/basic_data_gen/runner.py --provider_name=openai --model_name=gpt-4 --log_level=DEBUG --batch_size=1 --num_samples=1 --output_file_name=example_output.jsonl --example_config=textbooks_are_all_you_need_basic_split
```

### Key Command-Line Arguments

- `--provider_name`: Which provider to use for completions (default: "openai").
- `--model_name`: The name of the model to load from the provider (default: "gpt-3.5-turbo").
- `--temperature`: Temperature parameter for the provided model (default: 0.7).
- `--example_config`: Which example configuration to use (default: "textbooks_are_all_you_need").
- `--example_config`: Which example configuration to use (default: "textbooks_are_all_you_need_basic_split").
- `--override_config_path`: Used to override the example configurations with custom config.
- `--num_samples`: Number of samples to generate (default: 1_024).
- `--output_dir`: File path to override the default output file path with.
Expand All @@ -75,7 +75,7 @@ poetry run python sciphi/examples/data_generation/runner.py --provider_name=open
### Stock data configs

- `evol_instruct` - A config for replicating the EvolInstruct dataset
- `textbooks_are_all_you_need` - A config for replicating the Python textbook data from Textbooks Are All You Need [2]
- `textbooks_are_all_you_need_basic_split` - A config for replicating the Python textbook data from Textbooks Are All You Need [2]

### Example generated data
<img width="776" alt="Screenshot 2023-09-17 at 11 11 18 PM" src="https://github.com/emrgnt-cmplxty/SciPhi/assets/68796651/8f1ef11d-cd37-4fc7-a7a0-a1e0159ba4a3">
Expand Down
6 changes: 4 additions & 2 deletions sciphi/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def __init__(
self.config = yaml.safe_load(file)
self.main_path = os.path.dirname(main_config_path)
self.generator_mode: str = self.config.pop(generator_mode_field)

# Unpack the prompt config
self.prompt_templates: dict[str, int] = self.config.pop(
prompt_templates_field
Expand Down Expand Up @@ -76,7 +75,10 @@ def _load_configs(self) -> None:
entry = sub_config[key]

# Unroll dependencies (only single layers are supported now)
if key in self.prompt_template_input_dependencies:
if (
self.prompt_template_input_dependencies
and key in self.prompt_template_input_dependencies
):
entry = entry[
self.prompt_template_input_dependencies[key]
]
Expand Down
33,078 changes: 33,078 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/evol_grade_school.yaml

Large diffs are not rendered by default.

33,784 changes: 33,784 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/evol_seminar_i.yaml

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions sciphi/data/stock_config/textbooks_are_all_you_need_evol/main.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion sciphi/examples/basic_data_gen/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import argparse
import hashlib
import os
import random
import secrets
import time

from sciphi.config import DataConfig, DataGeneratorMode
from sciphi.core.utils import (
Expand All @@ -26,6 +28,8 @@
)
from sciphi.writers import JsonlDataWriter

random.seed(time.time())

OUTPUT_FILE_NAME = "{RUN_NAME}__provider_eq_{PROVIDER}__model_eq_{MODEL}__version_eq_{VERSION}{EXTRA}.jsonl"


Expand All @@ -44,10 +48,11 @@ def generate_random_hash() -> str:
prep_for_file_path(args.provider_name),
prep_for_file_path(args.model_name),
)

if not os.path.exists(output_dir):
os.makedirs(output_dir)

# TODO - Fail check if path does not exist after attempted creation

if not args.output_file_name:
output_file_name = OUTPUT_FILE_NAME.format(
**{
Expand Down
2 changes: 1 addition & 1 deletion sciphi/examples/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def parse_arguments() -> argparse.Namespace:
parser.add_argument(
"--example_config",
type=str,
default="textbooks_are_all_you_need",
default="textbooks_are_all_you_need_basic_split",
help="Which configuration to use for data generation?",
)
parser.add_argument(
Expand Down
1 change: 0 additions & 1 deletion sciphi/llm/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def get_config_for_provider(
provider_name: ProviderName,
) -> Type[LLMConfig]:
"""Get the configuration class for a given model."""
print("config_registry = ", LLMConfigManager.config_registry)
config_class = LLMConfigManager.config_registry.get(provider_name)

if not config_class:
Expand Down
8 changes: 8 additions & 0 deletions sciphi/makers/data_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ def __init__(
prompt_generator: PromptGenerator,
outer_prompt: Prompt,
dataset_name: Optional[str] = None,
dataset_filters: Optional[dict[str, str]] = None,
) -> None:
self.generator_mode = generator_mode
self.prompt_generator = prompt_generator
self.outer_prompt = outer_prompt
self.dataset_name = dataset_name
self.dataset_filters = dataset_filters

def synthetic_generator(
self, batch_size: int, num_samples: int
Expand Down Expand Up @@ -65,6 +67,12 @@ def hf_dataset_generator(

dataset = load_dataset(self.dataset_name, streaming=True)

# filter the dataset, on key and list of filter values
if self.dataset_filters:
for key, values in self.dataset_filters.items():
dataset = dataset.filter(
lambda example: example[key] in values,
)
counter = 0
for data in dataset["train"]:
inner_prompt = self.prompt_generator.generate_prompt(
Expand Down
12 changes: 8 additions & 4 deletions sciphi/prompt/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def _random_sample(vars_and_weights: dict) -> str:

def generate_prompt(self, optional_formatters=None) -> dict:
"""Return a prompt and its inputs."""

# Build the prompt formatters
formatters: dict[str, str] = optional_formatters or {}

Expand All @@ -47,14 +46,20 @@ def generate_prompt(self, optional_formatters=None) -> dict:
# It can be `None` if there are no dependencies
for prompt_input in self.prompt_inputs:
entry = self.config[prompt_input]
if prompt_input in self.prompt_dataset_dependencies:
if (
self.prompt_dataset_dependencies
and prompt_input in self.prompt_dataset_dependencies
):
# Parse dataset dependencies
entry = PromptGenerator._random_sample(entry)

self._insert_formatter(
formatters, prompt_input, optional_formatters[entry]
)
elif prompt_input in self.prompt_template_input_dependencies:
elif (
self.prompt_template_input_dependencies
and prompt_input in self.prompt_template_input_dependencies
):
# Parse single depth dependencies
dependent_on = self.prompt_template_input_dependencies[
prompt_input
Expand All @@ -69,7 +74,6 @@ def generate_prompt(self, optional_formatters=None) -> dict:
self._insert_formatter(formatters, prompt_input, entry)

prompt = PromptGenerator._random_sample(self.prompt_templates)

return {
PromptGenerator.RAW_PROMPT_TAG: prompt,
PromptGenerator.FORMATTED_PROMPT_TAG: prompt.format_map(
Expand Down

0 comments on commit c24cc1f

Please sign in to comment.