bootstrap in progress

meta-introspector · Feb 17, 2024 · d440108 · d440108
1 parent c098588
commit d440108
Show file tree

Hide file tree

Showing 4 changed files with 294 additions and 113 deletions.
diff --git a/bin/simple_grammar.ml b/bin/simple_grammar.ml
@@ -1,5 +1,6 @@
 open Lang_agent
 
+
 type backend =
   | BNone
   | BLlamaCpp of Llama_cpp.llama_cpp_lang_model
@@ -15,47 +16,90 @@ let lc_lang_prompt lang_client  param_record prompt =
                                             | _ -> "")
   | _ -> ""
 
-let process_prompt: backend -> 'client_t2 -> string -> string -> string -> string -> int ->unit =
-  fun client1 param_record path model prompt1 suffix repeat ->
-  (print_endline ("Consider model: " ^  model ^ " path: "^ path));
-  let aux dir =
-    let full_path = dir  in
-    let full_out_path = full_path ^ suffix in
+(* [do_one prompt1 client1 param_record full_out_path] sends [prompt1] to the
+   backend through [lc_lang_prompt]. Unless the reply is the "ERROR" sentinel,
+   prompt and reply are written to [full_out_path] as org-mode src blocks.
+   Returns the reply, or "erro" when the sentinel was seen. *)
+let do_one prompt1 client1 param_record full_out_path=
+  print_endline ("chunk: " ^  prompt1);
+  let res = lc_lang_prompt client1 param_record prompt1 in
+  print_endline ("OUTPUT: " ^ full_out_path);
+  (* Structural [=] compares contents; physical [==] is not guaranteed to
+     hold for two equal strings, so the sentinel check could never fire. *)
+  if res = "ERROR" then (
+    print_endline ("ERROR: " ^ full_out_path);
+    "erro"
+  ) else (
+    let oc = open_out full_out_path in
+    Printf.fprintf oc
+      "\n#+begin_src input\n%s\n#+end_src\n#+begin_src output\n%s\n#+end_src\n"
+      prompt1 res;
+    close_out oc;
+    res
+  )
+
+let window_size = ref 1024
+
+(* [split_file ic n] reads [ic] until end of file and groups its lines into
+   chunks of at most [n] lines, each line keeping a trailing '\n'. Chunks are
+   returned in file order inside a [ref]; an empty input gives [ref []]. *)
+let split_file ic n =
+  let buf = Buffer.create 1024 in
+  (* Chunks are consed in reverse and flipped once at the end, which avoids
+     re-walking the whole list with [List.append] for every window. *)
+  let rev_chunks = ref [] in
+  let lines = ref 0 in
+  let flush () =
+    (* An empty buffer must not turn into an empty chunk. *)
+    if Buffer.length buf > 0 then
+      rev_chunks := Buffer.contents buf :: !rev_chunks;
+    Buffer.clear buf;
+    lines := 0
+  in
+  let eof = ref false in
+  while not !eof do
+    match input_line ic with
+    | exception End_of_file -> eof := true
+    | line ->
+      Buffer.add_string buf line;
+      Buffer.add_char buf '\n';
+      incr lines;
+      (* Window full; its last line is already in [buf], so no extra chunk. *)
+      if !lines >= n then flush ()
 
-    if Sys.file_exists  full_out_path then
-      print_endline ("SKIP existing" ^ full_out_path)
-    else
-      let do_one  (data)=
-        let prompt = prompt1 ^ data in
-        (* print_endline ("send" ^ prompt); *)
-        let res = (lc_lang_prompt client1 param_record prompt ) in
-
-        print_endline ("OUTPUT: " ^ full_out_path);
-
-        if res == "ERROR"
-        then
-          print_endline ("ERROR: " ^ full_out_path)
-        else
-          (
-            let oc = open_out full_out_path in
-            Printf.fprintf oc
-              "\n#+begin_src input\n%s\n#+end_src\n#+begin_src output %s\n%s\n#+end_src\n"
-              data model res;
-            close_out oc;
-          );
-
-        "FIXME"
+  done;
+  flush ();
+  ref (List.rev !rev_chunks)
 
-      in
-
-        let _ = do_one prompt1 in
-        ()
 
-in
-for i = 1 to repeat do
-  aux (path ^ "_" ^(string_of_int i))
-done
+(* Opens [full_path] and returns its [split_file] chunks of [!window_size]
+   lines; the channel is closed even when reading raises. *)
+let do_split_file full_path =
+  let ic = open_in full_path in
+  print_endline ("OPEN INPUT: " ^ full_path);
+  Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () -> split_file ic !window_size)
 
+(* [aux dir suffix prompt1 client1 param_record] runs [do_one] for the output
+   file [dir ^ suffix] and returns its result; when that file already exists
+   nothing is sent and "error" is returned instead. *)
+let aux dir suffix prompt1 client1 param_record =
+  let target = dir ^ suffix in
+  (* An existing output file is never overwritten. *)
+  match Sys.file_exists target with
+  | true ->
+    print_endline ("SKIP existing" ^ target);
+    "error"
+  | false ->
+    print_endline ("going to create" ^ target);
+    do_one prompt1 client1 param_record target
+
+(* Logs [model] and [path], then runs [aux] on [path ^ "_" ^ i] for i = 1..repeat. *)
+let process_prompt: backend -> 'client_t2 -> string -> string -> string -> string -> int ->unit =
+  fun client1 param_record path model prompt1 suffix repeat ->
+  print_endline ("Consider model: " ^  model ^ " path: "^ path);
+  for i = 1 to repeat do
+    ignore (aux (path ^ "_" ^ string_of_int i) suffix prompt1 client1 param_record)
+  done
 
 let anon_fun _ = ()
 
@@ -70,6 +114,7 @@ let lc_init lang_client aurl amodel agrammar=
     let c3 = m#lang_set_grammar c2 agrammar in
     B2LlamaCpp c3
 
+
 
 let read_whole_file filename =
   let ch = open_in_bin filename in
@@ -103,6 +148,8 @@ let () =
   Printf.printf "DEBUG3 path %s\n" !start;
   (print_endline ("DEBUG4 MODEL :" ^ ! model) );
   grammar := read_whole_file  !grammar;
-  prompt := read_whole_file  !prompt;
-    let client_param_record = lc_init !lang_client !url !model !grammar  in 
-    process_prompt !lang_client client_param_record !start !model !prompt !suffix !item_count
+  let chunks = do_split_file !prompt in
+  let client_param_record = lc_init !lang_client !url !model !grammar  in 
+  let do_one p = process_prompt !lang_client client_param_record !start !model p !suffix !item_count in
+  let _ = (List.map do_one ! chunks) in ()
+
diff --git a/grammar2.sh b/grammar2.sh
@@ -1,20 +1,19 @@
-GRAMMAR=~/experiments/gbnf_parser/grammars/ebnf.ebnf
+GRAMMAR=./grammars/ebnf.ebnf
 GRAMMAR_C=$(cat $GRAMMAR)
 
-GRAMMAR2=~/experiments/gbnf_parser/lib/sentenceParser.mly
+GRAMMAR2=./grammars/sentenceParser.mly
 GRAMMAR2_C=$( cat $GRAMMAR2 )
 
-DATA=$(cat notes.org)
 DS=$(date -Iseconds)
-PROMPT_NAME="prompt_grammar2_${DS}.txt"
+PROMPT_NAME="prompt_grammar3.txt"
 
 echo "Consider the following grammar between BEGINSRC and ENDSRC. BEGINSRC ${GRAMMAR2_C} ENDSRC . Please rewrite it to be more beautiful. We are going to use the following TARGET: BEGINTARGET ${GRAMMAR_C} ENDTARGET as our target grammar format. Please rewrite SRC into TARGET. " > $PROMPT_NAME
 
 dune exec bin/simple_grammar.exe -- \
     --llamacpp \
     -u "http://localhost:8080" \
-    -s "grammar_1_${DS}"   \
+    -s "data/grammar/grammar_1_${DS}"   \
     -g $GRAMMAR \
     -p $PROMPT_NAME \
     -x ".txt" \
-    -n 6
+    -n 20
diff --git a/grammars/sentenceParser.mly b/grammars/sentenceParser.mly
@@ -0,0 +1,189 @@
+%{
+open Syntax
+%}
+
+%token <int> Tchar
+%token DASH "-"
+%token CARET "^"
+%token
+  BAR              "|"
+  EOF              ""
+  LPAREN           "("
+  RPAREN           ")" 
+  QUESTION         "?"
+  STAR             "*"
+  PLUS             "+"
+NEWLINE
+
+%token <string Positions.located>
+   LID              "lident"
+   REGEX            "regex"
+   QID              "\"alias\""
+
+%token
+   COLONCOLONEQUAL  "::="
+
+%start <Syntax.partial_grammar> grammar
+%type <Syntax.myfactor>  factor
+%type <Syntax.myfactor>  alternation
+%type <Syntax.myfactor>  modifier
+%type <Syntax.myfactor>  complexterms
+%type <Syntax.myfactor>  term
+%type <Syntax.myfactor>  fstar
+%type <Syntax.myfactor>  sterm
+%type <Syntax.myfactor>  char_class
+%type <string Positions.located>  lid
+%type <string Positions.located>  qid
+%type <Syntax.myfactor>  termfactor
+
+%%
+/* rules: NEWLINE-separated rule definitions. NOTE(review): the recursive branch keeps only Rule $3 and drops $1. */
+rules:
+    | rules NEWLINE+ rule
+                       {
+		         (print_endline (Batteries.dump ("DEBUG:OLDRULE1",$3)));
+                         Rule $3
+		       }
+/* NOTE(review): the dump below logs $1 (the NEWLINE+ value), while the action returns Rule $2. */
+     |  NEWLINE+ rule    {
+                     (print_endline (Batteries.dump ("DEBUG:OLDRULE",$1)));
+                     Rule $2
+} 
+     |rule  {
+(print_endline (Batteries.dump ("DEBUG:OLDRULE",$1)));
+Rule $1
+} 
+/* grammar: start symbol (rules then postlude). NOTE(review): rs is only logged; pg_rules is always []. */
+grammar:
+  rs =  rules postlude
+    {
+      (print_endline (Batteries.dump ("DEBUG:grammar",rs, $2)));
+      {
+        pg_filename          = ""; (* filled in by the caller *)
+        pg_rules             = [];
+      }
+    }
+/* rule: LID "::=" rhs. The record takes its name and position from the LID; branches is only logged. */
+rule:
+symbol = LID
+/* the symbol that is being defined */
+COLONCOLONEQUAL
+branches = rhs
+    {
+      (print_endline (Batteries.dump ("DEBUG:rule", symbol, branches)));
+      {
+        pr_nt          = Positions.value symbol;
+        pr_positions   = [ Positions.position symbol ];
+        pr_branches    =  [] (* FIXME: should be built from [branches]; currently always empty *)
+      }
+    }
+
+    postlude:
+    NEWLINE*
+             EOF
+               { 
+(print_endline (Batteries.dump ("DEBUG:DONE"))) 
+               }
+
+located(X):
+  x = X
+    { with_loc $loc x }
+
+%inline qid:
+  | QID {      (print_endline (Batteries.dump ("DEBUG:quid", $1)));  $1 }
+%inline lid:
+  | LID {      (print_endline (Batteries.dump ("DEBUG:lid", $1)));  $1 }
+
+%inline sterm:
+  | qid {      (print_endline (Batteries.dump ("DEBUG:sterm/quid", $1))); SFactor $1}
+  | lid {      (print_endline (Batteries.dump ("DEBUG:sterm/lid", $1))); SFactor $1}
+
+term:
+  | complexterms {      (print_endline (Batteries.dump ("DEBUG:term/cterms", $1))); NFactor $1}
+  | sterm {      (print_endline (Batteries.dump ("DEBUG:term/sterm", $1))); NFactor $1}
+
+%inline  complexterms: 
+  | group1 {      (print_endline (Batteries.dump ("DEBUG:cterm/group", $1))); NFactor $1}
+  | class1  {      (print_endline (Batteries.dump ("DEBUG:cterm/class", $1))); NFactor $1}
+
+%inline  group1: 
+ | LPAREN NEWLINE* rhs  RPAREN {      (print_endline (Batteries.dump ("DEBUG:rhs", $3))); NFactor $3} 
+
+%inline class1: 
+/* | LBRACE char_class  RBRACE {} */
+  |  char_class   {      (print_endline (Batteries.dump ("DEBUG:class1a", $1))); NFactor $1}
+  |  REGEX {      (print_endline (Batteries.dump ("DEBUG:class", $1))); SFactor $1}
+
+%inline termfactor:
+  | term   {      (print_endline (Batteries.dump ("DEBUG:termfactor", $1))); NFactor $1}
+/* factor: a termfactor with an optional modifier. NOTE(review): $2 (the modifier) is not used; both branches return NFactor $1. */
+factor:
+  | termfactor modifier {
+		 (* (print_endline (Batteries.dump ("DEBUG:factormod", ($1,$2)))); *)
+		 (* let foo = CFactor ($1, $2) *)
+	(* in foo 
+           (CFactor ($1, $2) )
+         *)
+
+        NFactor $1
+      }
+
+  | termfactor  {
+	(* (print_endline (Batteries.dump ("DEBUG:factor", $1))); *)
+	(* let foo = SFactor $1 in *)
+	(* foo *)
+        NFactor $1
+      }
+
+%inline modifier:
+  | fplus {      (print_endline (Batteries.dump ("DEBUG:mod", $1))); NFactor $1}
+  | fquest {      (print_endline (Batteries.dump ("DEBUG:quest", $1))); NFactor $1}
+  | fstar {      (print_endline (Batteries.dump ("DEBUG:star", $1))); NFactor $1}
+
+%inline fstar:
+   |  STAR {
+          (* (print_endline (Batteries.dump ("DEBUG:star", $1))); *)
+          Star
+        }
+%inline fquest:
+  |  QUESTION {      (print_endline (Batteries.dump ("DEBUG:quest", $1))); Question}
+%inline fplus:
+  | PLUS {      (print_endline (Batteries.dump ("DEBUG:plus", $1))); Plus}
+/* concatenation: one or more factors. NOTE(review): the recursive branch returns NFactor $1, so the new factor ($2) is dropped. */
+concatenation:
+  | concatenation factor  {      (print_endline (Batteries.dump ("DEBUG:concat1", $1))); NFactor $1}
+  | factor {      (print_endline (Batteries.dump ("DEBUG:concat2", $1))); NFactor $1}
+/* alternation: concatenations separated by BAR, with NEWLINE* allowed after it. NOTE(review): the right-hand concatenation ($4) is dropped. */
+alternation:
+  | alternation BAR NEWLINE* concatenation { NFactor  $1 }
+  | concatenation {      (print_endline (Batteries.dump ("DEBUG:alt", $1))); NFactor $1}
+
+rhs:
+  | alternation {      (print_endline (Batteries.dump ("DEBUG:rhs", $1))); NFactor $1}
+/* char_class: char_class1, optionally preceded by CARET. NOTE(review): the CARET branch
+   wraps $2 without complementing it, and the plain branch returns a bare CharClass, not $1. */
+char_class:
+    CARET char_class1
+    /* { Cset.complement $2 } */
+{   (print_endline (Batteries.dump ("DEBUG:ccrs",$2))) ; NFactor $2}
+  | char_class1
+    /* { $1 } */
+    {   (print_endline (Batteries.dump ("DEBUG:cc2rs",$1))); CharClass }
+;
+char_class1:  /* a Tchar or a Tchar DASH Tchar range, followed by any number of further Tchar */
+    Tchar DASH Tchar
+    /* { Cset.interval $1 $3 } */
+         {   (print_endline (Batteries.dump ("DEBUG:cc3rs",$1,$2)));  CharInt $1
+             (* fixme: the upper bound $3 is ignored; the dump logs $2, the DASH token *)
+         }
+  | char_class1 Tchar
+    /* Cset.singleton $1; NOTE(review): the action drops the appended Tchar ($2) */
+    {   (print_endline (Batteries.dump ("DEBUG:cc4rs",$1))); NFactor $1 }
+  | Tchar
+    /* Cset.singleton $1 */
+    {   (print_endline (Batteries.dump ("DEBUG:cc5rs",$1))); CharInt $1 }
+  /* | char_class1 char_class1  CONCAT */
+  /*       { Cset.union $1 $2 } */
+;
+
+%%