Skip to content

Commit

Permalink
booststrap in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
mike dupont committed Feb 17, 2024
1 parent c098588 commit d440108
Show file tree
Hide file tree
Showing 4 changed files with 294 additions and 113 deletions.
127 changes: 87 additions & 40 deletions bin/simple_grammar.ml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
open Lang_agent


type backend =
| BNone
| BLlamaCpp of Llama_cpp.llama_cpp_lang_model
Expand All @@ -15,47 +16,90 @@ let lc_lang_prompt lang_client param_record prompt =
| _ -> "")
| _ -> ""

let process_prompt: backend -> 'client_t2 -> string -> string -> string -> string -> int ->unit =
fun client1 param_record path model prompt1 suffix repeat ->
(print_endline ("Consider model: " ^ model ^ " path: "^ path));
let aux dir =
let full_path = dir in
let full_out_path = full_path ^ suffix in
(** [do_one prompt1 client1 param_record full_out_path] sends [prompt1] to the
    model through [lc_lang_prompt] and, unless the reply is the ["ERROR"]
    sentinel, writes an org-mode record of the prompt and the reply to
    [full_out_path].

    Returns the reply text, or ["erro"] when the reply was ["ERROR"]; no file
    is written in that case. *)
let do_one prompt1 client1 param_record full_out_path =
  let prompt = prompt1 in
  print_endline ("chunk: " ^ prompt);
  let res = lc_lang_prompt client1 param_record prompt in
  print_endline ("OUTPUT: " ^ full_out_path);
  (* Compare string contents. The previous [==] tested physical identity,
     which is not a reliable way to recognise a string returned by a call,
     so the error branch could not be depended on. *)
  if String.equal res "ERROR" then begin
    print_endline ("ERROR: " ^ full_out_path);
    "erro"
  end else begin
    let oc = open_out full_out_path in
    (* Release the channel even when writing fails, then re-raise. *)
    (match
       Printf.fprintf oc
         "\n#+begin_src input\n%s\n#+end_src\n#+begin_src output\n%s\n#+end_src\n"
         prompt res
     with
     | () -> close_out oc
     | exception e ->
       close_out_noerr oc;
       raise e);
    res
  end


(** Maximum number of input lines gathered into one chunk: [do_split_file]
    passes the current value to [split_file] as its per-chunk line count. *)
let window_size = ref 1024

let split_file ic n =
let chunks = ref [] in
let chunk = ref 0 in
let buf = Buffer.create 1024 in
let eof = ref false in
let line = ref "" in
while not !eof do
incr chunk;
let lines = ref 0 in
while not !eof && !lines < n do
try
line := input_line ic;
Buffer.add_string buf !line;
Buffer.add_char buf '\n';
incr lines
with
End_of_file ->
eof := true
done;

chunks := List.append !chunks [ Buffer.contents buf ];
Buffer.clear buf;
if not ! eof then
(*append the last line as a chunk*)
chunks := List.append !chunks [ !line ]
(* clear the buffer *)

if Sys.file_exists full_out_path then
print_endline ("SKIP existing" ^ full_out_path)
else
let do_one (data)=
let prompt = prompt1 ^ data in
(* print_endline ("send" ^ prompt); *)
let res = (lc_lang_prompt client1 param_record prompt ) in

print_endline ("OUTPUT: " ^ full_out_path);

if res == "ERROR"
then
print_endline ("ERROR: " ^ full_out_path)
else
(
let oc = open_out full_out_path in
Printf.fprintf oc
"\n#+begin_src input\n%s\n#+end_src\n#+begin_src output %s\n%s\n#+end_src\n"
data model res;
close_out oc;
);

"FIXME"
done;
(* close the input file *)
chunks

in

let _ = do_one prompt1 in
()

in
for i = 1 to repeat do
aux (path ^ "_" ^(string_of_int i))
done
(** [do_split_file full_path] opens [full_path] for reading, splits it with
    [split_file] using the current [!window_size] as the per-chunk line
    count, closes the channel and returns whatever [split_file] returned.

    The channel is closed on the failure path as well, so an exception
    escaping [split_file] no longer leaks the file descriptor.
    @raise Sys_error if [full_path] cannot be opened. *)
let do_split_file full_path =
  let ic = open_in full_path in
  print_endline ("OPEN INPUT: " ^ full_path);
  match split_file ic !window_size with
  | chunks ->
    close_in ic;
    chunks
  | exception e ->
    (* Best-effort close; the original exception is the one worth reporting. *)
    close_in_noerr ic;
    raise e

(** [aux dir suffix prompt1 client1 param_record] computes the output path
    [dir ^ suffix]. When that file already exists it logs a skip message and
    returns ["error"]; otherwise it logs the path and returns the result of
    [do_one] for that path. *)
let aux dir suffix prompt1 client1 param_record =
  let out_file = dir ^ suffix in
  match Sys.file_exists out_file with
  | true ->
    print_endline ("SKIP existing" ^ out_file);
    "error"
  | false ->
    print_endline ("going to create" ^ out_file);
    do_one prompt1 client1 param_record out_file

(** [process_prompt client1 param_record path model prompt1 suffix repeat]
    prints the model name and base path, then calls [aux] once for every
    [i] from 1 to [repeat], using [path ^ "_" ^ string_of_int i] as the target.
    The string returned by [aux] is discarded; [model] is only used in the
    log line. *)
let process_prompt: backend -> 'client_t2 -> string -> string -> string -> string -> int ->unit =
  fun client1 param_record path model prompt1 suffix repeat ->
    print_endline ("Consider model: " ^ model ^ " path: " ^ path);
    for i = 1 to repeat do
      let target = path ^ "_" ^ string_of_int i in
      ignore (aux target suffix prompt1 client1 param_record)
    done

(** Callback that ignores its argument and returns unit.
    NOTE(review): presumably the handler for anonymous command-line
    arguments; the call site is outside this view, so confirm. *)
let anon_fun = fun _ -> ()

Expand All @@ -70,6 +114,7 @@ let lc_init lang_client aurl amodel agrammar=
let c3 = m#lang_set_grammar c2 agrammar in
B2LlamaCpp c3



let read_whole_file filename =
let ch = open_in_bin filename in
Expand Down Expand Up @@ -103,6 +148,8 @@ let () =
Printf.printf "DEBUG3 path %s\n" !start;
(print_endline ("DEBUG4 MODEL :" ^ ! model) );
grammar := read_whole_file !grammar;
prompt := read_whole_file !prompt;
let client_param_record = lc_init !lang_client !url !model !grammar in
process_prompt !lang_client client_param_record !start !model !prompt !suffix !item_count
let chunks = do_split_file !prompt in
let client_param_record = lc_init !lang_client !url !model !grammar in
let do_one p = process_prompt !lang_client client_param_record !start !model p !suffix !item_count in
let _ = (List.map do_one ! chunks) in ()

11 changes: 5 additions & 6 deletions grammar2.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
GRAMMAR=~/experiments/gbnf_parser/grammars/ebnf.ebnf
GRAMMAR=./grammars/ebnf.ebnf
GRAMMAR_C=$(cat $GRAMMAR)

GRAMMAR2=~/experiments/gbnf_parser/lib/sentenceParser.mly
GRAMMAR2=./grammars/sentenceParser.mly
GRAMMAR2_C=$( cat $GRAMMAR2 )

DATA=$(cat notes.org)
DS=$(date -Iseconds)
PROMPT_NAME="prompt_grammar2_${DS}.txt"
PROMPT_NAME="prompt_grammar3.txt"

echo "Consider the following grammar between BEGINSRC and ENDSRC. BEGINSRC ${GRAMMAR2_C} ENDSRC . Please rewrite it to be more beautiful. We are going to use the following TARGET: BEGINTARGET ${GRAMMAR_C} ENDTARGET as our target grammar format. Please rewrite SRC into TARGET. " > $PROMPT_NAME

dune exec bin/simple_grammar.exe -- \
--llamacpp \
-u "http://localhost:8080" \
-s "grammar_1_${DS}" \
-s "data/grammar/grammar_1_${DS}" \
-g $GRAMMAR \
-p $PROMPT_NAME \
-x ".txt" \
-n 6
-n 20
189 changes: 189 additions & 0 deletions grammars/sentenceParser.mly
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
%{
open Syntax
%}

%token <int> Tchar
%token DASH "-"
%token CARET "^"
%token
BAR "|"
EOF ""
LPAREN "("
RPAREN ")"
QUESTION "?"
STAR "*"
PLUS "+"
NEWLINE

%token <string Positions.located>
LID "lident"
REGEX "regex"
QID "\"alias\""

%token
COLONCOLONEQUAL "::="

%start <Syntax.partial_grammar> grammar
%type <Syntax.myfactor> factor
%type <Syntax.myfactor> alternation
%type <Syntax.myfactor> modifier
%type <Syntax.myfactor> complexterms
%type <Syntax.myfactor> term
%type <Syntax.myfactor> fstar
%type <Syntax.myfactor> sterm
%type <Syntax.myfactor> char_class
%type <string Positions.located> lid
%type <string Positions.located> qid
%type <Syntax.myfactor> termfactor

%%

/* rules: one or more rule entries separated by runs of NEWLINE, optionally
   preceded by leading NEWLINEs. Every alternative prints a debug dump and
   returns the Rule constructor applied to the most recently parsed rule.
   NOTE(review): in the recursive alternative the value of the preceding
   rules ($1) is never used, so earlier rules are not accumulated here. */
rules:
| rules NEWLINE+ rule
{
(print_endline (Batteries.dump ("DEBUG:OLDRULE1",$3)));
Rule $3
}

| NEWLINE+ rule {
(* NOTE(review): $1 is the NEWLINE+ list here; the rule itself is $2 *)
(print_endline (Batteries.dump ("DEBUG:OLDRULE",$1)));
Rule $2
}
|rule {
(print_endline (Batteries.dump ("DEBUG:OLDRULE",$1)));
Rule $1
}

/* grammar: the start symbol declared by %start. Parses rules followed by
   postlude, prints a debug dump of both values, and returns a record with an
   empty pg_filename and an empty pg_rules list.
   NOTE(review): the parsed value rs is only dumped; it is not stored in
   pg_rules. */
grammar:
rs = rules postlude
{
(print_endline (Batteries.dump ("DEBUG:grammar",rs, $2)));
{
pg_filename = ""; (* filled in by the caller *)
pg_rules = [];
}
}

/* rule: a single definition of the form LID COLONCOLONEQUAL rhs. Prints a
   debug dump of the symbol and the right-hand side, then builds a record
   whose pr_nt is the symbol value and whose pr_positions holds the symbol
   position. pr_branches is left empty; the existing Fixme says it should
   carry branches. */
rule:
symbol = LID
/* the symbol that is being defined */
COLONCOLONEQUAL
branches = rhs
{
(print_endline (Batteries.dump ("DEBUG:rule", symbol, branches)));
{
pr_nt = Positions.value symbol;
pr_positions = [ Positions.position symbol ];
pr_branches = [] (*Fixme should be brancheS*)
}
}

/* postlude: any number of trailing NEWLINE tokens followed by EOF. The action
   only prints a debug marker, so its value is unit. */
postlude:
NEWLINE*
EOF
{
(print_endline (Batteries.dump ("DEBUG:DONE")))
}

/* located(X): parameterized rule that parses X and passes its value, together
   with the Menhir location keyword $loc, to with_loc.
   NOTE(review): with_loc is not defined in this file (presumably it comes from
   the opened Syntax module), and no other rule in this grammar appears to use
   located; confirm whether it is still needed. */
located(X):
x = X
{ with_loc $loc x }

/* qid: a QID token; prints a debug dump and returns the token value as is. */
%inline qid:
| QID { (print_endline (Batteries.dump ("DEBUG:quid", $1))); $1 }
/* lid: a LID token; prints a debug dump and returns the token value as is. */
%inline lid:
| LID { (print_endline (Batteries.dump ("DEBUG:lid", $1))); $1 }

/* sterm: a simple term, either a qid or a lid, wrapped in SFactor. */
%inline sterm:
| qid { (print_endline (Batteries.dump ("DEBUG:sterm/quid", $1))); SFactor $1}
| lid { (print_endline (Batteries.dump ("DEBUG:sterm/lid", $1))); SFactor $1}

/* term: either a complex term or a simple term; the sub-result is wrapped in
   NFactor in both cases. */
term:
| complexterms { (print_endline (Batteries.dump ("DEBUG:term/cterms", $1))); NFactor $1}
| sterm { (print_endline (Batteries.dump ("DEBUG:term/sterm", $1))); NFactor $1}

/* complexterms: a parenthesised group or a character class / regex, each
   wrapped in a further NFactor. */
%inline complexterms:
| group1 { (print_endline (Batteries.dump ("DEBUG:cterm/group", $1))); NFactor $1}
| class1 { (print_endline (Batteries.dump ("DEBUG:cterm/class", $1))); NFactor $1}

/* group1: LPAREN, optional NEWLINEs, an rhs, then RPAREN; returns the rhs
   value ($3) wrapped in NFactor. The debug label printed is DEBUG:rhs. */
%inline group1:
| LPAREN NEWLINE* rhs RPAREN { (print_endline (Batteries.dump ("DEBUG:rhs", $3))); NFactor $3}

/* class1: a char_class wrapped in NFactor, or a REGEX token wrapped in
   SFactor. The brace-delimited alternative below is commented out. */
%inline class1:
/* | LBRACE char_class RBRACE {} */
| char_class { (print_endline (Batteries.dump ("DEBUG:class1a", $1))); NFactor $1}
| REGEX { (print_endline (Batteries.dump ("DEBUG:class", $1))); SFactor $1}

/* termfactor: a term wrapped in NFactor. */
%inline termfactor:
| term { (print_endline (Batteries.dump ("DEBUG:termfactor", $1))); NFactor $1}

/* factor: a termfactor with an optional modifier. Both alternatives return
   NFactor applied to the termfactor value.
   NOTE(review): in the first alternative the modifier ($2) is parsed but not
   used; the commented-out code suggests CFactor ($1, $2) was the intended
   result. */
factor:
| termfactor modifier {
(* (print_endline (Batteries.dump ("DEBUG:factormod", ($1,$2)))); *)
(* let foo = CFactor ($1, $2) *)
(* in foo
(CFactor ($1, $2) )
*)

NFactor $1
}

| termfactor {
(* (print_endline (Batteries.dump ("DEBUG:factor", $1))); *)
(* let foo = SFactor $1 in *)
(* foo *)
NFactor $1
}

/* modifier: one of the repetition markers fplus, fquest or fstar, wrapped in
   NFactor. The fplus alternative prints the label DEBUG:mod. */
%inline modifier:
| fplus { (print_endline (Batteries.dump ("DEBUG:mod", $1))); NFactor $1}
| fquest { (print_endline (Batteries.dump ("DEBUG:quest", $1))); NFactor $1}
| fstar { (print_endline (Batteries.dump ("DEBUG:star", $1))); NFactor $1}

/* fstar: the STAR token; returns the Star constructor without printing. */
%inline fstar:
| STAR {
(* (print_endline (Batteries.dump ("DEBUG:star", $1))); *)
Star
}
/* fquest: the QUESTION token; prints a debug dump and returns Question. */
%inline fquest:
| QUESTION { (print_endline (Batteries.dump ("DEBUG:quest", $1))); Question}
/* fplus: the PLUS token; prints a debug dump and returns Plus. */
%inline fplus:
| PLUS { (print_endline (Batteries.dump ("DEBUG:plus", $1))); Plus}

/* concatenation: a left-recursive sequence of one or more factors.
   NOTE(review): the recursive alternative returns NFactor of the preceding
   concatenation ($1) only; the newly parsed factor ($2) is not used. */
concatenation:
| concatenation factor { (print_endline (Batteries.dump ("DEBUG:concat1", $1))); NFactor $1}
| factor { (print_endline (Batteries.dump ("DEBUG:concat2", $1))); NFactor $1}

/* alternation: concatenations separated by BAR, with optional NEWLINEs after
   each BAR.
   NOTE(review): the recursive alternative returns NFactor of the preceding
   alternation ($1) only; the new concatenation ($4) is not used. */
alternation:
| alternation BAR NEWLINE* concatenation { NFactor $1 }
| concatenation { (print_endline (Batteries.dump ("DEBUG:alt", $1))); NFactor $1}

/* rhs: the right-hand side of a rule; an alternation wrapped in NFactor. */
rhs:
| alternation { (print_endline (Batteries.dump ("DEBUG:rhs", $1))); NFactor $1}


/* char_class: a character class, optionally preceded by CARET.
   NOTE(review): the Cset-based actions are commented out. The CARET form
   returns NFactor $2 without applying any complement, and the plain form
   returns the constant CharClass, using $1 only for the debug dump. */
char_class:
CARET char_class1
/* { Cset.complement $2 } */
{ (print_endline (Batteries.dump ("DEBUG:ccrs",$2))) ; NFactor $2}
| char_class1
/* { $1 } */
{ (print_endline (Batteries.dump ("DEBUG:cc2rs",$1))); CharClass }
;
/* char_class1: the body of a character class: a Tchar DASH Tchar range, a
   single Tchar, or an existing body followed by one more Tchar.
   NOTE(review): the Cset-based actions are commented out. The range form
   returns CharInt of the first Tchar only (its dump prints $1 and the DASH
   value $2, never the second Tchar $3), and the recursive form returns
   NFactor $1 without using the appended Tchar ($2). */
char_class1:
Tchar DASH Tchar
/* { Cset.interval $1 $3 } */
{ (print_endline (Batteries.dump ("DEBUG:cc3rs",$1,$2))); CharInt $1
(*fixme*)
}
| char_class1 Tchar
/* Cset.singleton $1 */
{ (print_endline (Batteries.dump ("DEBUG:cc4rs",$1))); NFactor $1 }
| Tchar
/* Cset.singleton $1 */
{ (print_endline (Batteries.dump ("DEBUG:cc5rs",$1))); CharInt $1 }
/* | char_class1 char_class1 CONCAT */
/* { Cset.union $1 $2 } */
;

%%
Loading

0 comments on commit d440108

Please sign in to comment.