diff --git a/airoboros/instructors/coding.py b/airoboros/instructors/coding.py index 00ec711..98f0c7c 100644 --- a/airoboros/instructors/coding.py +++ b/airoboros/instructors/coding.py @@ -43,10 +43,11 @@ async def generate(instructor): if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) - count = instructor.instructor_counts.get("coding", 0) + if "coding" not in instructor.instructor_counts: + instructor.instructor_counts["coding"] = 0 language_index = 0 language = config.get("language") or instructor.language - while count < target_count: + while instructor.instructor_counts["coding"] < target_count: # Inject languages to use for this batch. current_languages = [] for _ in range(batch_size): @@ -111,7 +112,11 @@ async def generate(instructor): instructions.append( instruction if not plain else instruction + " PLAINFORMAT" ) - futures.append(instructor.generate_response(full_instruction, **api_params)) + futures.append( + instructor.generate_response( + full_instruction, filter_response=False, **api_params + ) + ) if not futures: continue responses = await asyncio.gather(*futures) @@ -128,6 +133,5 @@ async def generate(instructor): "response": response.strip(), "category": "coding", } - count += 1 - if count >= target_count: + if instructor.instructor_counts["coding"] >= target_count: break diff --git a/airoboros/instructors/contextual.py b/airoboros/instructors/contextual.py index 997b35a..0f66d2c 100644 --- a/airoboros/instructors/contextual.py +++ b/airoboros/instructors/contextual.py @@ -180,13 +180,14 @@ async def generate(instructor): min_score = float(min_score) # Generate the instruction/response pairs until we reach the target count. - count = instructor.instructor_counts.get("contextual", 0) + if "contextual" not in instructor.instructor_counts: + instructor.instructor_counts["contextual"] = 0 batch_size = config.get("batch_size") if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) futures = [] - while count < target_count: + while instructor.instructor_counts["contextual"] < target_count: prompt = generate_prompt(instructor, config, template, topic_iter) futures.append(instructor.generate_response(prompt, **api_params)) if len(futures) < batch_size: @@ -223,7 +224,6 @@ async def generate(instructor): "response": response.strip(), "category": "contextual", } - count += 1 - if count >= target_count: + if instructor.instructor_counts["contextual"] >= target_count: break futures = [] diff --git a/airoboros/instructors/counterfactual_contextual.py b/airoboros/instructors/counterfactual_contextual.py index 1fbbddd..9d08f19 100644 --- a/airoboros/instructors/counterfactual_contextual.py +++ b/airoboros/instructors/counterfactual_contextual.py @@ -55,13 +55,14 @@ async def generate(instructor): min_score = float(min_score) # Generate the instruction/response pairs until we reach the target count. - count = instructor.instructor_counts.get("counterfactual_contextual", 0) + if "counterfactual_contextual" not in instructor.instructor_counts: + instructor.instructor_counts["counterfactual_contextual"] = 0 batch_size = config.get("batch_size") if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) language = config.get("language") or instructor.language - while count < target_count: + while instructor.instructor_counts["counterfactual_contextual"] < target_count: response = await instructor.generate_response( template.format(batch_size=batch_size, language=language), **api_params ) @@ -164,7 +165,9 @@ async def generate(instructor): "response": response.strip(), "category": "counterfactual_contextual", } - count += 1 - if count >= target_count: + if ( + instructor.instructor_counts["counterfactual_contextual"] + >= target_count + ): break futures = [] diff --git a/airoboros/instructors/experience.py b/airoboros/instructors/experience.py index f19adf9..5b92f8c 100644 --- a/airoboros/instructors/experience.py +++ b/airoboros/instructors/experience.py @@ -30,16 +30,19 @@ async def generate(instructor): min_score = float(min_score) # Generate the instruction/response pairs until we reach the target count. - count = instructor.instructor_counts.get("experience", 0) + if "experience" not in instructor.instructor_counts: + instructor.instructor_counts["experience"] = 0 language = config.get("language") or instructor.language batch_size = config.get("batch_size") if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) futures = [] - while count < target_count: + while instructor.instructor_counts["experience"] < target_count: futures.append( - instructor.generate_response(prompt.format(language=language), **api_params) + instructor.generate_response( + prompt.format(language=language), filter_response=False, **api_params + ) ) if len(futures) < batch_size: continue @@ -67,7 +70,6 @@ async def generate(instructor): "response": response, "category": "experience", } - count += 1 - if count >= target_count: + if instructor.instructor_counts["experience"] >= target_count: break futures = [] diff --git a/airoboros/instructors/general.py b/airoboros/instructors/general.py index 9324560..ae21e9c 100644 --- a/airoboros/instructors/general.py +++ b/airoboros/instructors/general.py @@ -41,9 +41,10 @@ async def generate(instructor): if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) - count = instructor.instructor_counts.get("general", 0) + if "general" not in instructor.instructor_counts: + instructor.instructor_counts["general"] = 0 language = config.get("language") or instructor.language - while count < target_count: + while instructor.instructor_counts["general"] < target_count: # Inject the topics to use for this batch. current_topics = [] for _ in range(batch_size): @@ -93,6 +94,5 @@ async def generate(instructor): "response": response.strip(), "category": "general", } - count += 1 - if count >= target_count: + if instructor.instructor_counts["general"] >= target_count: break diff --git a/airoboros/instructors/inline_qa.py b/airoboros/instructors/inline_qa.py index 588036d..bd64e00 100644 --- a/airoboros/instructors/inline_qa.py +++ b/airoboros/instructors/inline_qa.py @@ -37,9 +37,10 @@ async def generate( if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) - count = instructor.instructor_counts.get(category, 0) + if category not in instructor.instructor_counts: + instructor.instructor_counts[category] = 0 language = config.get("language") or instructor.language - while count < target_count: + while instructor.instructor_counts[category] < target_count: # Get a batch of instructions. prompt_args = {"language": language} if "{batch_size}" in template: @@ -66,6 +67,5 @@ async def generate( "response": response.strip(), "category": category, } - count += 1 - if count >= target_count: + if instructor.instructor_counts[category] >= target_count: break diff --git a/airoboros/instructors/simple_task.py b/airoboros/instructors/simple_task.py index 6694203..6fef7b6 100644 --- a/airoboros/instructors/simple_task.py +++ b/airoboros/instructors/simple_task.py @@ -45,9 +45,10 @@ async def generate(instructor, category): if batch_size is None: batch_size = instructor.default_batch_size batch_size = int(batch_size) - count = instructor.instructor_counts.get(category, 0) + if category not in instructor.instructor_counts: + instructor.instructor_counts[category] = 0 language = config.get("language") or instructor.language - while count < target_count: + while instructor.instructor_counts[category] < target_count: # Get a batch of instructions. prompt = ( template.format(batch_size=batch_size, language=language) @@ -89,6 +90,5 @@ async def generate(instructor, category): "response": response.strip(), "category": category, } - count += 1 - if count >= target_count: + if instructor.instructor_counts[category] >= target_count: break diff --git a/airoboros/self_instruct.py b/airoboros/self_instruct.py index f1ceb5c..891cc58 100644 --- a/airoboros/self_instruct.py +++ b/airoboros/self_instruct.py @@ -408,6 +408,7 @@ def persist(self, item): self.outfile.flush() self.docstores[-1].add_texts([item["instruction"]]) self.docstore_size += 1 + self.instructor_counts[item["category"]] += 1 if self.docstore_size >= MAX_DOCSTORE_SIZE: logger.info("Initializing new docstore...") self.docstores.append( diff --git a/setup.py b/setup.py index 00cf2c7..bc71b9e 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="airoboros", - version="2.0.7", + version="2.0.8", description="Updated and improved implementation of the self-instruct system.", long_description=long_description, long_description_content_type="text/markdown",