Skip to content

Commit

Permalink
Add sanitization to strip LLM control tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
acusti committed May 10, 2024
1 parent 20a0e98 commit 25173c4
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion packages/parsing/src/parse-as-json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,16 @@ const getObjectKeyFromIndex = (index: number) =>

const OBJECT_KEY_REGEXP = /^\s*"[^"]+":/;

const CONTROL_TOKENS_REGEXP = /(^<\|im_start\|>|<\|im_end\|>$)/;

type ParsedValue = string | boolean | number | GenericObject | Array<unknown>;

export function parseAsJSON(text: string): ParsedValue | null {
// if the input is undefined/null, return null to indicate failure
if (text == null) return null;

// attempt to parse the string as-is
text = text.replace(CONTROL_TOKENS_REGEXP, '');
// attempt to parse the string as-is (minus control tokens)
try {
return JSON.parse(text) as ParsedValue;
} catch (error) {
Expand Down

0 comments on commit 25173c4

Please sign in to comment.