There are two grammars here so far. The first is an ANTLR lexer fragment that tokenizes Perl regular-expression literals. The second is a Smalltalk-80 grammar (originally written for PCCTS, not ANTLR).
This fragment works as part of an ANTLR lexer. Because an IDENTIFIER starts with an ALPHA, and so do the match (`m...`) and substitution (`s...`) forms, the IDENTIFIER rule includes SUBSTITUTION and MATCH as alternatives in order to resolve the lexical ambiguity. The fragment does not handle Perl's trailing regex flags, because we did not allow them in our use of this grammar, but they would be easy to add.
// IDENTIFIER doubles as the entry point for the regex-literal rules.
// SUBSTITUTION and MATCH are each tried first behind a syntactic predicate;
// when one succeeds, the token type is switched to that rule's type.
// Otherwise the input is lexed as an ordinary identifier.
IDENTIFIER
: (SUBSTITUTION)=>SUBSTITUTION #{$setType(SUBSTITUTION);}
| (MATCH)=>MATCH #{$setType(MATCH);}
// Zero or more trailing characters, so that one-character names such as "x"
// (and a lone "m" or "s" that is not a regex literal) are still identifiers.
| (ALPHA|UNDERSCORE)(ALPHA|DIGIT|UNDERSCORE)*
;
// MATCH lexes a match-style regex literal in one of two forms:
//   m<delim> body <delim>   - <delim> is any character REGEX_DELIM accepts
//   / body /                - the bare-slash form
// It is a protected rule; in this excerpt it is invoked from IDENTIFIER.
protected
MATCH
{
// Delimiter character; assigned by the first predicate below.
char c = '\00';
}
:
// Start the match with the normal 'm'
'm'
// Record the next character as the delimiter (it must not be '\00'),
// then consume it.
#{((c=LA(1)) != '\00')}? REGEX_DELIM
INSIDE_REGEX[c]
// The closing delimiter must be the same character that opened the literal.
#{(LA(1) == c)}? REGEX_DELIM
|
// Bare-slash form: no leading 'm'; the delimiter is fixed to '/'.
'/' INSIDE_REGEX['/'] '/'
;
// SUBSTITUTION lexes a substitution literal of the form
//   s<delim> body <delim> body <delim>
// where all three delimiters are the same character.
// It is a protected rule; in this excerpt it is invoked from IDENTIFIER.
protected
SUBSTITUTION
{
// Delimiter character; assigned by the first predicate below.
char c = '\00';
}
:
's'
// Record the next character as the delimiter (it must not be '\00'),
// then consume it.
#{((c=LA(1)) != '\00')}? REGEX_DELIM
// First body (the part between the first and second delimiter).
INSIDE_REGEX[c]
#{(LA(1) == c)}? REGEX_DELIM
// Second body (the part between the second and third delimiter).
INSIDE_REGEX[c]
#{(LA(1) == c)}? REGEX_DELIM
;
// INSIDE_REGEX consumes the body of a regex literal up to, but not including,
// the next character equal to the delimiter m that is not swallowed by ESC.
//
// The delimiter test guards every character, including the first, so the body
// may be empty (as in m// or the replacement half of s/foo//). If the first
// character were consumed unconditionally, an empty body would swallow its
// own closing delimiter and keep scanning.
//
// ESC is tried before '.', so whatever ESC matches is consumed as a unit.
// ESC is defined outside this excerpt - presumably it covers a backslash
// escape such as an escaped delimiter; verify.
//
// The rule recurses once per body element consumed.
protected
INSIDE_REGEX[char m]
:
( #{ ( LA(1) != m )}? (ESC | .) INSIDE_REGEX[m] )?
;
// REGEX_DELIM matches a single delimiter character: any character except an
// ASCII letter, an ASCII digit, or '\00'.
protected
REGEX_DELIM
: ~('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '\00')
;
This is a quick, grammar-only output from a PCCTS version of a Smalltalk-80 grammar. It should be trivial to convert to ANTLR (and I will, sooner or later).
/* Top-level rule (presumably the start rule): one or more class definitions,
   each followed by a bang, then "@", which PCCTS uses for end of input. */
parse :
( classDefinition bang )+ "@"
;
/* Either a class header followed by zero or more bang-terminated methods,
   or a run of one or more static statements. */
classDefinition :
classHeader ( method bang )*
| ( staticStatement )+
;
/* A keyword message sent to a class name, in one of two fixed shapes:
     1. keyword sharp className, then three keyword/stringConstant pairs,
        then a keyword whose argument is an identifier or a stringConstant;
     2. a single keyword with a stringConstant argument.
   Either shape may be followed by a period (reading the trailing group as
   the PCCTS optional group).
   NOTE(review): shape 1 looks like a Smalltalk class-creation message and
   shape 2 like a class comment - confirm against real input. */
staticStatement :
className ( keyword sharp className keyword stringConstant keyword stringConstant keyword stringConstant keyword ( identifier | stringConstant ) | keyword stringConstant ) #{ "\." }
;
/* Header that introduces a group of methods: a bang, the class name, an
   identifier (optional, reading the group around it as the PCCTS optional
   group), a keyword with a stringConstant argument, and a closing bang.
   NOTE(review): presumably the file-in form "!Name class methodsFor: '...'!"
   - confirm. */
classHeader :
bang className #{ identifier } keyword stringConstant bang
;
/* A method: its message pattern, then temporaries and a primitive marker
   (each optional, reading the groups around them as PCCTS optional groups),
   then the statement list. */
method :
messagePattern #{ temporaries } #{ primitive } statements
;
/* The selector-and-arguments part of a method definition:
     - a unary selector with no argument,
     - a binary selector with one argument name, or
     - one or more keyword/argument-name pairs. */
messagePattern :
unarySelector
| binarySelector variableName
| ( keyword variableName )+
;
/* Temporary-variable declaration: zero or more variable names enclosed
   between two vertical bars. */
temporaries :
verticalBar ( variableName )* verticalBar
;
/* A possibly empty statement list (reading the group below as the PCCTS
   optional group around nonEmptyStatements). */
statements :
#{ nonEmptyStatements }
;
/* A statement list with at least one statement:
     - an uparrow followed by an expression, with a period allowed after it;
       nothing further follows in this alternative; or
     - an expression, which may be followed by a dot and the remaining
       statements (which may themselves be empty).
   Both trailing groups are read as PCCTS optional groups. */
nonEmptyStatements :
uparrow expression #{ "\." }
| expression #{ dot statements }
;
/* An assignment or a simple expression.
   The leading "( variableName assign )?" reads as a PCCTS syntactic
   predicate: the first alternative is taken only when the input starts with
   a variable name followed by the assignment token. The assignment is
   right-recursive, so chained assignments nest to the right. */
expression :
( variableName assign )? variableName assign expression
| simpleExpression
;
/* A primary, which may be followed by a message expression and then zero or
   more "semicolon messageElt" continuations (reading the group after primary
   as the PCCTS optional group). */
simpleExpression :
primary #{ messageExpression ( semicolon messageElt )* }
;
/* One message following a semicolon in simpleExpression:
     - a unary selector,
     - a binary selector with a unaryObjectDescription argument, or
     - one or more keyword/binaryObjectDescription pairs. */
messageElt :
unarySelector
| binarySelector unaryObjectDescription
| ( keyword binaryObjectDescription )+
;
/* The message part that follows a primary: a unary, binary, or keyword
   expression. */
messageExpression :
unaryExpression
| binaryExpression
| keywordExpression
;
/* One or more unary selectors, which may be followed by a binary expression
   or a keyword expression (reading the trailing group as the PCCTS optional
   group). */
unaryExpression :
( unarySelector )+ #{ binaryExpression | keywordExpression }
;
/* One or more "binarySelector unaryObjectDescription" pairs, which may be
   followed by a keyword expression (reading the trailing group as the PCCTS
   optional group). */
binaryExpression :
( binarySelector unaryObjectDescription )+ #{ keywordExpression }
;
/* One or more keyword parts, each with a binaryObjectDescription argument. */
keywordExpression :
( keyword binaryObjectDescription )+
;
/* A primary followed by zero or more unary selectors. Used as the argument
   of a binary selector. */
unaryObjectDescription :
primary ( unarySelector )*
;
/* A primary followed by zero or more unary selectors, then zero or more
   "binarySelector unaryObjectDescription" pairs. Used as the argument of a
   keyword. The leading "primary ( unarySelector )*" is the same sequence as
   the body of unaryObjectDescription. */
binaryObjectDescription :
primary ( unarySelector )* ( binarySelector unaryObjectDescription )*
;
/* The operand of an expression: a literal, a variable name, a block, or a
   parenthesized expression. */
primary :
literal
| variableName
| block
| openParen expression closeParen
;
/* A literal value: number, character, or string constant, or a sharp
   followed by a symbol or an array. */
literal :
numberConstant
| characterConstant
| stringConstant
| sharp ( symbol | array )
;
/* A bracketed block: open bracket, a parameter section, statements, close
   bracket. The parameter section is one or more "colon variableName" pairs
   ended by a vertical bar, and may be omitted (reading the group around it
   as the PCCTS optional group). */
block :
openBracket #{ ( colon variableName )+ verticalBar } statements closeBracket
;
/* A parenthesized list of zero or more array elements. The introducing
   sharp is matched by the caller (see literal), not here. */
array :
openParen ( arrayConstantElt )* closeParen
;
/* One element of an array: a number, character, or string constant, a
   symbol, or a nested array. Inside an array, symbol and array appear
   without a leading sharp. */
arrayConstantElt :
numberConstant
| characterConstant
| stringConstant
| symbol
| array
;
/* The body of a symbol: an identifier, a binary selector, or a keyword. */
symbol :
identifier
| binarySelector
| keyword
;
/* A unary selector is a plain identifier. */
unarySelector :
identifier
;
/* A binary selector is either a binaryOperator token or a vertical bar.
   The vertical bar has its own token (verticalBar), so it is listed
   separately here. */
binarySelector :
binaryOperator
| verticalBar
;
/* A class name in parentheses. No other rule in this excerpt refers to
   this rule. */
type :
openParen className closeParen
;
/* A class name is a plain identifier. */
className :
identifier
;
/* A variable name is a plain identifier. */
variableName :
identifier
;
/* The "!" token. In this grammar it ends each class definition and each
   method, and brackets the class header. */
bang :
"!"
;
/* The "^" token, which introduces the first alternative of
   nonEmptyStatements. */
uparrow :
"^"
;
/* The period token; the backslash escapes the regex metacharacter. */
dot :
"\."
;
/* The assignment token. The regex is an alternation: either ":=" or a
   single underscore. */
assign :
":=|_"
;
/* The ";" token, used in simpleExpression to introduce each messageElt. */
semicolon :
";"
;
/* The "#" token, which introduces a symbol or array in literal and a class
   name in staticStatement. */
sharp :
"#"
;
/* The ":" token, which precedes each parameter name in block. */
colon :
":"
;
/* The "[" token (escaped because it is a regex metacharacter); opens a
   block. */
openBracket :
"\["
;
/* The "]" token (escaped because it is a regex metacharacter); closes a
   block. */
closeBracket :
"\]"
;
/* The "(" token (escaped because it is a regex metacharacter). */
openParen :
"\("
;
/* The ")" token (escaped because it is a regex metacharacter). */
closeParen :
"\)"
;
/* The "|" token (escaped because it is the regex alternation operator).
   Used to delimit temporaries, to end block parameters, and as a binary
   selector. */
verticalBar :
"\|"
;
/* A binary operator token of one or two characters: one character from the
   first class, then at most one character from the second class. The second
   class repeats the first and adds "!" and the vertical bar.
   NOTE(review): this reads the group after the first class as the DLG
   optional group; as written, the "#" in front of its brace would be a
   literal character - confirm whether it is a page-markup artifact. */
binaryOperator :
"([/<>%&?,\+\=\@\-\\\*\~])#{[/<>%&?,!\+\=\@\|\-\\\*\~]}"
;
/* Wraps the KEYWORD token, whose definition is not part of this excerpt
   (presumably an identifier immediately followed by a colon - verify). */
keyword :
KEYWORD
;
/* An identifier token: one ASCII letter followed by zero or more ASCII
   letters or digits. Underscore is not included. */
identifier :
"[a-zA-Z][a-zA-Z0-9]*"
;
/* A character constant: a dollar sign followed by exactly one character
   that is not in the excluded set. The excluded set lists "@", newline,
   carriage return, tab, and space ("@" presumably stands for end of file
   here, as it does elsewhere in DLG - verify). */
characterConstant :
"\$~[@\n\r\t\ ]"
;
/* Wraps the STRING_LITERAL token, whose definition is not part of this
   excerpt. */
stringConstant :
STRING_LITERAL
;
/* A number constant: one or more decimal digits. No sign, radix, fraction,
   or exponent is matched by this regex. */
numberConstant :
"[0-9]+"
;
/* Wraps the PRIMITIVE token, whose definition is not part of this excerpt.
   Appears only as an optional element of method. */
primitive :
PRIMITIVE
;