Skip to content

Commit

Permalink
better grammar without bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
mike dupont committed Feb 19, 2024
1 parent b551fc2 commit 0832ef7
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 42 deletions.
12 changes: 9 additions & 3 deletions grammar_run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@

# Abort the script as soon as any command fails.
set -e

# Path to the grammar file, relative to the current working directory.
GRAMMAR=./grammars/ebnf.ebnf
# Full text of the grammar file. The path is quoted (and preceded by `--`)
# so it is never word-split, glob-expanded, or mistaken for an option.
GRAMMAR_C=$(cat -- "$GRAMMAR")

# Timestamp in ISO-8601 format with seconds precision; interpolated into
# the output path further down so each run gets its own name.
DS=$(date -Iseconds)

Expand All @@ -10,8 +13,11 @@ echo "Create an EBNF grammar. Consider the following chunk. BEGINSRC " > $HPROM
echo " ENDSRC . Please rewrite it to in the EBNF form." > $TPROMPT_NAME

for PROMPT_NAME in data/folder/grammar/sim*.txt;
do echo $PROMPT_NAME;
dune exec bin/simple_grammar.exe -- \
do
echo $PROMPT_NAME;
echo "${GRAMMAR}"
cat "${GRAMMAR}"
dune exec bin/simple_grammar.exe -- \
--llamacpp \
-u "http://localhost:8080" \
-s "data/folder/grammar/out/grammar_1_${DS}" \
Expand Down
113 changes: 74 additions & 39 deletions grammars/ebnf.ebnf
Original file line number Diff line number Diff line change
@@ -1,71 +1,106 @@
# GBNF (GGML BNF) is a format for defining formal grammars to constrain model outputs in llama.cpp.
# Backus-Naur Form (BNF) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
# In GBNF, we define production rules that specify how a non-terminal (rule name) can be replaced with sequences of terminals (characters, specifically Unicode code points) and other non-terminals. The basic format of a production rule is nonterminal ::= sequence....

production_rule ::= alternation
lhs ::= identifier
rule ::= lhs S "=" S production_rule S | comment
root ::= ( S rule S ) *

# Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit (\UXXXXXXXX).

range ::= "-"
factor_range ::= term S range S term

# Character ranges can be negated with ^:
negate ::= "^"

#Sequences and Alternatives
#The order of symbols in a sequence matters. For example, in "1. " move " " move "\n", the "1. " must come before the first move, etc.
concatenation ::= ( S factor S ? ) +
negate ::= "^"

# Alternatives, denoted by |, give different sequences that are acceptable.
alternation ::= "|"
alternation ::= ( S concatenation S alternation ? ) +

alternationsymbol ::= "|"


#Sequences and Alternatives

# Parentheses () can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.
parens_open ::= "("
parens_close ::= ")"
parens ::= parens_open | parens_close

parensopen ::= "("

parensclose ::= ")"

parens ::= parensopen | parensclose

#Repetition and Optional Symbols
repetition_symbols ::= repetition_plus | repetition_star | repetition_optional

repetitionsymbols ::= repetitionplus | repetitionstar | repetitionoptional

#* after a symbol or sequence means that it can be repeated zero or more times.
repetition_star ::= "*"

repetitionstar ::= "*"

#+ denotes that the symbol or sequence should appear one or more times.
repetition_plus ::= "+"

repetitionplus ::= "+"

#? makes the preceding symbol or sequence optional.
repetition_optional ::= "?"

repetitionoptional ::= "?"


#Comments and newlines
#Comments can be specified with #:

comment ::= "#" [a-zA-Z0-9 \t]*

# Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker | will continue the current rule, even outside of parentheses.


letter ::= [a-zA-Z]

digit ::= [0-9]
S ::= ( " " | "\n" | "\t" | "\r" )
braces_open ::= "["
braces_close ::= "]"
braces_symbol ::= braces_open | braces_close

spacechar ::= " "
S ::= ( spacechar | "\n" | "\t" | "\r" )

bracesopen ::= "["

bracesclose ::= "]"

bracessymbol ::= bracesopen | bracesclose

quote ::= "\""

assignment ::= "::="

symbol ::= braces_symbol | parens | quotes |assignment | alternation | range | repetition_symbols | negate

character ::= letter | digit | symbol | "_" | " "
identifier ::= letter ( letter | digit | "_" )*

underscorecharacter ::= "_"

character ::= letter | digit | symbol | underscorecharacter | spacechar

symbol ::= bracessymbol | parens | quote | assignment | alternationsymbol | range | repetitionsymbols | negate



# no underscores in these names

identifier ::= letter ( letter | digit )*

terminal ::= quote character+ quote
group ::= parens_open S production_rule S parens_close
range_term ::= braces_open S production_rule S braces_close
term ::= group |range_term | terminal | identifier

repetition ::= term S occurence
factor_negate ::= negate S factor
factor ::= repetition |factor_range | term S
group ::= parensopen S productionrule S parensclose

rangeterm ::= bracesopen S productionrule S bracesclose

term ::= group |rangeterm | terminal | identifier

repetition ::= term S repetitionsymbols

factornegate ::= negate S factor

factorrange ::= term S range S term

factor ::= repetition |factorrange | term S

#The order of symbols in a sequence matters.

concatenation ::= ( S factor S ? ) +

alternation ::= ( S concatenation S alternationsymbol ? ) +

productionrule ::= alternation


lhs ::= identifier

rule ::= lhs S "=" S productionrule S | comment

root ::= ( S rule S ) *

0 comments on commit 0832ef7

Please sign in to comment.