问题
I would like to know, if this is a bug or behavior, that is intended by the inventor.
Here I have a minimal example of a dypgen grammar:
{
open Parse_tree
let dyp_merge = Dyp.keep_all
}
%start main
%layout [' ' '\t']
%%
main:
| a "\n" { $1 }
a:
| ms b { Mt ($1,$2) }
| b <Mt(_,_)> kon1 b { Koo ($1, $2, $3) }
| b <Mt(_,_)> kon2 b { Koo ($1, $2, $3) }
| b { $1 }
b:
| k { $1 }
| ns b { Nt ($1,$2) }
/* If you comment this line out, it will work with the permutation, but I need the 'n' ! */
/* | b <Nt(_,_)> kon1 b { Koo ($1, $2, $3) }
| b <Nt(_,_)> kon2 b { Koo ($1, $2, $3) }*/
k:
| k kon1 k { Koo ($1, $2, $3) }
| k kon2 k { Koo ($1, $2, $3) }
| ks { $1 }
ms:
| words <M(_)> { $1 }
ns:
| words <N(_)> { $1 }
ks:
| words <K(_)> { $1 }
kon1:
| words <U(_)> { $1 }
kon2:
| 'y' { Y($1) }
words:
| chain { $1 }
throw_away:
| word "|" throw_away { $3 }
| word { $1 }
chain:
| word "|" throw_away { $1 }
| word "|" chain { $3 }
| word { $1 }
word:
| ('m' ['1'-'9']?) { M ($1) }
| ('n' ['1'-'9']?) { N ($1) }
| ('K' ['1'-'9']?) { K ($1) }
| ('u' ['1'-'9']?) { U ($1) }
The example can handle such grammars:
Think about the ? and * as regular expression operators, and 's' and 'm' and 'K' as lexems.
s = m? n* K
the 'K', 'm' and 'n' can also be replaced by these letters and a following number between 1-9 or they can be replaced by lists delimited by '|' as
m1
n1|n2
K|K|K or K1|K2|K3
and these lists could also be mixed as
m1|n1|K1
all these lists are parsed as possible ambiguities, that are globally merged – in the known sense for dypgen – with
let dyp_merge = Dyp.keep_all
If you type in:
m1|n1|K1 m1|n1|K1 m1|n1|K1
you get the results:
m1 > n1 > K1 n1 > n1 > K1
If you type in
K1|K2
you get
K1 K2
Now the interesting point: In the grammar there is another feature. There is a "koordination binding" in the style of natural languages with 'u' or with 'y'.
This can bind these lists of "phrases" (a 'K' letter with optional fronting 'm' and a optinal number of 'n's) to somethin like "K1 and K2". The grammer can parse:
K1|K2 u K3|K4
K1|K2 y K3|K4
And as I thought, it should have the same result. But the difference between the "koordination bindings" is: lexem 'u' is defined as a list of ambiguities in the same ways as m, n, K and could also be mixed with 'K's, 'm's, 'n's lexem 'y' is defined without this list festure.
And this makes a (surprising) difference:
K1|K2 u K3|K4
is parsed as:
koo { K1 u K4 } koo { K2 u K4 }
and
K1|K2 y K3|K4
is parsed as:
koo2 { K1 y K3 } koo2 { K2 y K3 } koo2 { K1 y K4 } koo2 { K2 y K4 }
In the first case the second part of the u-coordination is not permutated. In the second case the second part of the coordination is permutated (as dypgen does it with ambiguities normally).
Why this makes a difference?
(It must be somehow connected to the m's and n's, because if the rules for 'n's are left out, it works.)
Best regards and thank you for thinking about
gwf
The minimal example is in the style of the dypgen-demos, to try, make a folder "abc" in the demos and put all the mentioned, fully cited files in there. The "parse_tree":
type tree =
| M of string
| Mt of tree * tree
| N of string
| Nt of tree * tree
| K of string
| U of string
| Y of string
| Koo of tree * tree * tree
| Koo2 of tree * tree * tree * tree
A file "printit.ml": open Parse_tree
let print_abc abc=
let rec aux1 t = match t with
| Koo(x1, k, x2) -> (
print_string "\x1b[1m\x1b[31mkoo {\x1b[21m\027[0m ";
aux1 x1;
print_string "";
aux1 k;
print_string "";
aux1 x2;
print_string "\x1b[1m\x1b[31m}\x1b[21m\027[0m")
| Koo2(k1, x1, k2, x2) -> (
print_string "\x1b[1m\x1b[31mkoo2 {\x1b[21m\027[0m ";
aux1 k1;
print_string " ";
aux1 x1;
print_string "";
aux1 k2;
print_string "";
aux1 x2;
print_string "\x1b[1m\x1b[31m}\x1b[21m\027[0m")
| Word (w) -> print_string (w ^ " ")
| M (w) -> print_string (w ^ " ")
| K (w) -> print_string (w ^ " ")
| N (w) -> print_string (w ^ " ")
| U (w) -> print_string (w ^ " ")
| Y (w) -> print_string (w ^ " ")
| Nt (p, l)
| Mt (p, l) -> (
print_string "";
aux1 p;
print_string " > ";
aux1 l;)
in
let aux2 t = aux1 t; print_newline () in
List.iter aux2 abc
and the "main" program: open Parse_tree open Printit
let () = print_endline "
please try:
K1|K2 u K3|K4
and
K1|K2 y K3|K4
"
let lexbuf = Dyp.from_channel (Abc_parser.pp ()) stdin
let _ =
try
while true do
(Dyp.flush_input lexbuf;
try
let pf = Abc_parser.main lexbuf in
print_abc (List.map (fun (x,_) -> x) pf)
with
Dyp.Syntax_error -> Printf.printf "Syntax error\n\n"
);
flush stdout
done
with Failure _ -> exit 0
and the "Makefile"
SOURCES = printit.ml abc_parser.dyp abc.ml
REP = -I ../../dyplib
CAMLC = ocamlc $(REP)
DYPGEN = ../../dypgen/dypgen --ocamlc "-I ../../dyplib"
LIBS=dyp.cma
all: abc
SOURCES1 = $(SOURCES:.mll=.ml)
SOURCES2 = $(SOURCES1:.dyp=.ml)
OBJS = $(SOURCES2:.ml=.cmo)
abc: parse_tree.cmi $(OBJS)
$(CAMLC) -o abc $(LIBS) $(OBJS)
.SUFFIXES: .ml .mli .cmo .cmi .dyp
.ml.cmo:
$(CAMLC) -c $<
.mli.cmi:
$(CAMLC) -c $<
.dyp.ml:
$(DYPGEN) $<
$(CAMLC) -c $*.mli
clean:
rm -f *.cm[iox] *~ .*~ *.o
rm -f abc
rm -f *.extract_type *_temp.ml
rm -f *parser.ml *parser.mli
回答1:
I use dypgen but not the ambiguity handling.
A "merge point" is a point in the input stream where two parses of the same nonterminal are completed. If at this point the AST constructed by your action code is identical, you can safely discard either one of the two parses: there may be two ways to parse the non-terminal but the result is the same for both.
If the result is different, by default dypgen just throws one out unless you tell it to keep all the alternatives (which you have).
I'm not fully sure I understand your grammar, however there is a tricky thing in your grammar that may explain your problem.
Dypgen is GLR but it does NOT do proper GLR. If you have a recursion like
x = A x | A
y = y B | B
dypgen does tail and head optimisation and converts the recursion to a loop. You have that in your "throwaway". A real LR parser can only handle left recursion and would barf on right recursion. Dypgen handles both.
In a backtracking parser, if you have something like A*A as a grammar, it first fails on the trailing A because the leading A* ate all the A's in the input up, so it backtracks. GLR doesn't backtrack, it forks a new parse instead. But it cannot do that if it has tail or head optimised the recursion to a loop.
I suspect this is related to your problem. If you try to parse A*A* on AAAAA input, it should give 6 possible parses.
来源:https://stackoverflow.com/questions/39510092/is-this-handling-of-ambiguities-in-dypgen-normal-or-is-it-not