1
Fork 0
simple-sql-parser/Language/SQL/SimpleSQL/Parser.lhs

2107 lines
59 KiB
Plaintext
Raw Normal View History

2013-12-31 10:44:10 +01:00
Notes about the parser:
The lexers appear at the bottom of the file. There tries to be a clear
separation between the lexers and the other parser which only use the
lexers, this isn't 100% complete at the moment and needs fixing.
Left factoring:
The parsing code is aggressively left factored, and try is avoided as
much as possible. Use of try often makes the code hard to follow, so
this has helped the readability of the code a bit. More importantly,
debugging the parser and generating good parse error messages is aided
greatly by left factoring. Apparently it can also help the speed but
this hasn't been looked into.
Error messages:
A lot of care has been given to generating good error messages. There
are a few utils below which partially help in this area. There is also
a plan to write a really simple expression parser which doesn't do
precedence and associativity, and the fix these with a pass over the
ast. I don't think there is any other way to sanely handle the common
prefixes between many infix and postfix multiple keyword operators,
and some other ambiguities also. This should help a lot in generating
good error message also.
There is a set of crafted bad expressions in ErrorMessages.lhs, these
are used to guage the quality of the error messages and monitor
regressions by hand. The use of <?> is limited as much as possible,
since unthinking liberal sprinkling of it seems to make the error
messages much worse, and also has a similar problem to gratuitous use
of try - you can't easily tell which appearances are important and
which aren't.
Both the left factoring and error message work are greatly complicated
by the large number of shared prefixes of the various elements in SQL
syntax.
2014-04-19 12:10:46 +02:00
TOC:
Names - parsing identifiers
Typenames
Value expressions
Simple literals
star, param
parens expression, row constructor and scalar subquery
case, cast, exists, unique, array/ multiset constructor
typed literal, app, special function, aggregate, window function
suffixes: in, between, quantified comparison, match, array subscript,
escape, collate
operators
value expression top level
helpers
> {-# LANGUAGE TupleSections #-}
> -- | This is the module with the parser functions.
2013-12-13 15:04:48 +01:00
> module Language.SQL.SimpleSQL.Parser
> (parseQueryExpr
> ,parseValueExpr
> ,parseQueryExprs
2013-12-13 18:21:44 +01:00
> ,ParseError(..)) where
2013-12-13 11:39:26 +01:00
> import Control.Monad.Identity (Identity)
> import Control.Monad (guard, void, when)
> import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>))
> import Data.Maybe (fromMaybe,catMaybes)
> import Data.Char (toLower)
> import Text.Parsec (errorPos,sourceLine,sourceColumn,sourceName
> ,setPosition,setSourceColumn,setSourceLine,getPosition
> ,option,between,sepBy,sepBy1,string,manyTill,anyChar
> ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
> ,optionMaybe,optional,many,letter,parse
> ,chainl1, (<?>),notFollowedBy,alphaNum, lookAhead)
> import Text.Parsec.String (Parser)
> import qualified Text.Parsec as P (ParseError)
> import Text.Parsec.Perm (permute,(<$?>), (<|?>))
> import qualified Text.Parsec.Expr as E
> import Data.List (intercalate,sort,groupBy)
> import Data.Function (on)
2013-12-13 15:04:48 +01:00
> import Language.SQL.SimpleSQL.Syntax
2013-12-13 11:39:26 +01:00
= Public API
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
> -- | Parses a query expr, trailing semicolon optional.
2013-12-16 09:03:46 +01:00
> parseQueryExpr :: FilePath
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
2013-12-31 11:20:07 +01:00
> -- ^ line number and column number of the first character
> -- in the source (to use in errors)
2013-12-16 09:03:46 +01:00
> -> String
> -- ^ the SQL source to parse
2013-12-13 18:21:44 +01:00
> -> Either ParseError QueryExpr
2013-12-14 09:55:44 +01:00
> parseQueryExpr = wrapParse topLevelQueryExpr
2013-12-13 11:39:26 +01:00
2013-12-31 11:20:07 +01:00
> -- | Parses a list of query expressions, with semi colons between
2013-12-16 09:03:46 +01:00
> -- them. The final semicolon is optional.
> parseQueryExprs :: FilePath
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
2013-12-31 11:20:07 +01:00
> -- ^ line number and column number of the first character
> -- in the source (to use in errors)
2013-12-16 09:03:46 +01:00
> -> String
> -- ^ the SQL source to parse
> -> Either ParseError [QueryExpr]
2013-12-14 09:55:44 +01:00
> parseQueryExprs = wrapParse queryExprs
> -- | Parses a value expression.
> parseValueExpr :: FilePath
2013-12-16 09:03:46 +01:00
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
2013-12-31 11:20:07 +01:00
> -- ^ line number and column number of the first character
> -- in the source (to use in errors)
2013-12-16 09:03:46 +01:00
> -> String
> -- ^ the SQL source to parse
> -> Either ParseError ValueExpr
> parseValueExpr = wrapParse valueExpr
2013-12-14 09:55:44 +01:00
This helper function takes the parser given and:
sets the position when parsing
automatically skips leading whitespace
checks the parser parses all the input using eof
converts the error return to the nice wrapper
> wrapParse :: Parser a
2013-12-14 09:55:44 +01:00
> -> FilePath
> -> Maybe (Int,Int)
> -> String
> -> Either ParseError a
> wrapParse parser f p src =
2013-12-13 18:21:44 +01:00
> either (Left . convParseError src) Right
> $ parse (setPos p *> whitespace *> parser <* eof) f src
2013-12-13 18:21:44 +01:00
2013-12-14 09:55:44 +01:00
> -- | Type to represent parse errors.
2013-12-13 18:21:44 +01:00
> data ParseError = ParseError
2013-12-16 09:03:46 +01:00
> {peErrorString :: String
> -- ^ contains the error message
> ,peFilename :: FilePath
> -- ^ filename location for the error
> ,pePosition :: (Int,Int)
> -- ^ line number and column number location for the error
> ,peFormattedError :: String
> -- ^ formatted error with the position, error
> -- message and source context
2013-12-13 19:01:57 +01:00
> } deriving (Eq,Show)
2013-12-14 00:01:07 +01:00
2013-12-14 00:14:23 +01:00
------------------------------------------------
2013-12-13 18:21:44 +01:00
= Names
Names represent identifiers and a few other things. The parser here
handles regular identifiers, dotten chain identifiers, quoted
identifiers and unicode quoted identifiers.
Dots: dots in identifier chains are parsed here and represented in the
Iden constructor usually. If parts of the chains are non identifier
value expressions, then this is represented by a BinOp "."
instead. Dotten chain identifiers which appear in other contexts (such
as function names, table names, are represented as [Name] only.
Identifier grammar:
unquoted:
underscore <|> letter : many (underscore <|> alphanum
example
_example123
quoted:
double quote, many (non quote character or two double quotes
together), double quote
"example quoted"
"example with "" quote"
unicode quoted is the same as quoted in this parser, except it starts
with U& or u&
u&"example quoted"
> name :: Parser Name
> name = choice [QName <$> quotedIdentifier
> ,UQName <$> uquotedIdentifier
> ,Name <$> identifierBlacklist blacklist]
> names :: Parser [Name]
> names = ((:[]) <$> name) >>= optionSuffix another
> where
> another n =
> (((n++) . (:[])) <$> try (symbol "." *> name)) >>= optionSuffix another
= Type Names
Typenames are used in casts, and also in the typed literal syntax,
which is a typename followed by a string literal.
Here are the grammar notes:
== simple type name
just an identifier chain or a multi word identifier (this is a fixed
list of possibilities, e.g. as 'character varying', see below in the
parser code for the exact list).
<simple-type-name> ::= <identifier-chain>
| multiword-type-identifier
== Precision type name
<precision-type-name> ::= <simple-type-name> <left paren> <unsigned-int> <right paren>
e.g. char(5)
note: above and below every where a simple type name can appear, this
means a single identifier/quoted or a dotted chain, or a multi word
identifier
== Precision scale type name
<precision-type-name> ::= <simple-type-name> <left paren> <unsigned-int> <comma> <unsigned-int> <right paren>
e.g. decimal(15,2)
== Lob type name
this is a variation on the precision type name with some extra info on
the units:
<lob-type-name> ::=
<simple-type-name> <left paren> <unsigned integer> [ <multiplier> ] [ <char length units> ] <right paren>
<multiplier> ::= K | M | G
<char length units> ::= CHARACTERS | CODE_UNITS | OCTETS
(if both multiplier and char length units are missing, then this will
parse as a precision type name)
e.g.
clob(5M octets)
== char type name
this is a simple type with optional precision which allows the
character set or the collation to appear as a suffix:
<char type name> ::=
<simple type name>
[ <left paren> <unsigned-int> <right paren> ]
[ CHARACTER SET <identifier chain> ]
[ COLLATE <identifier chain> ]
e.g.
char(5) character set my_charset collate my_collation
= Time typename
this is typename with optional precision and either 'with time zone'
or 'without time zone' suffix, e.g.:
<datetime type> ::=
[ <left paren> <unsigned-int> <right paren> ]
<with or without time zone>
<with or without time zone> ::= WITH TIME ZONE | WITHOUT TIME ZONE
WITH TIME ZONE | WITHOUT TIME ZONE
= row type name
<row type> ::=
ROW <left paren> <field definition> [ { <comma> <field definition> }... ] <right paren>
<field definition> ::= <identifier> <type name>
e.g.
row(a int, b char(5))
= interval type name
<interval type> ::= INTERVAL <interval datetime field> [TO <interval datetime field>]
<interval datetime field> ::=
<datetime field> [ <left paren> <unsigned int> [ <comma> <unsigned int> ] <right paren> ]
= array type name
<array type> ::= <data type> ARRAY [ <left bracket> <unsigned integer> <right bracket> ]
= multiset type name
<multiset type> ::= <data type> MULTISET
A type name will parse into the 'smallest' constructor it will fit in
syntactically, e.g. a clob(5) will parse to a precision type name, not
a lob type name.
TODO: this code needs heavy refactoring
> typeName :: Parser TypeName
> typeName =
> (rowTypeName <|> intervalTypeName <|> ref <|> otherTypeName)
> >>= tnSuffix
> <?> "typename"
> where
> -- row type names - a little like create table
> rowTypeName =
> RowTypeName <$> (keyword_ "row" *> parens (commaSep1 rowField))
> rowField = (,) <$> name <*> typeName
> -- interval type names: interval a [to b]
> intervalTypeName =
> keyword_ "interval" >>
> uncurry IntervalTypeName <$> intervalQualifier
> ref =
> keyword_ "ref" >>
> RefTypeName
> <$> parens (names)
> <*> optionMaybe (keyword_ "scope" *> names)
> -- other type names, which includes:
> -- precision, scale, lob scale and units, timezone, character
> -- set and collations
> otherTypeName = do
> tn <- (try reservedTypeNames <|> names)
> choice [try $ timezone tn
> ,try (precscale tn) >>= optionSuffix charSuffix
> ,try $ lob tn
> ,optionSuffix charSuffix $ TypeName tn]
> timezone tn = do
> TimeTypeName tn
> <$> optionMaybe prec
> <*> choice [True <$ keywords_ ["with", "time","zone"]
> ,False <$ keywords_ ["without", "time","zone"]]
> charSuffix (PrecTypeName t p) = chars t (Just p)
> charSuffix (TypeName t) = chars t Nothing
> charSuffix _ = fail ""
> chars tn p =
> ((,) <$> option [] charSet
> <*> option [] tcollate)
> >>= uncurry mkit
> where
> mkit [] [] = fail ""
> mkit a b = return $ CharTypeName tn p a b
> lob tn = parens $ do
> (x,y) <- lobPrecToken
> z <- optionMaybe lobUnits
> return $ LobTypeName tn x y z
> precscale tn = parens (commaSep unsignedInteger) >>= makeWrap
> where
> makeWrap [a] = return $ PrecTypeName tn a
> makeWrap [a,b] = return $ PrecScaleTypeName tn a b
> makeWrap _ = fail "there must be one or two precision components"
> prec = parens unsignedInteger
> charSet = keywords_ ["character", "set"] *> names
> tcollate = keyword_ "collate" *> names
> lobPrecToken = lexeme $ do
> p <- read <$> many1 digit <?> "unsigned integer"
> x <- choice [Just LobK <$ keyword_ "k"
> ,Just LobM <$ keyword_ "m"
> ,Just LobG <$ keyword_ "g"
> ,return Nothing]
> return (p,x)
> lobUnits = choice [LobCharacters <$ keyword_ "characters"
> ,LobCodeUnits <$ keyword_ "code_units"
> ,LobOctets <$ keyword_ "octets"]
> -- deal with multiset and array suffixes
> tnSuffix x =
> multisetSuffix x <|> arraySuffix x <|> return x
> multisetSuffix x =
> (MultisetTypeName x <$ keyword_ "multiset") >>= tnSuffix
> arraySuffix x =
> (keyword_ "array" >> ArrayTypeName x
> <$> optionMaybe (brackets unsignedInteger)
> ) >>= tnSuffix
> -- this parser handles the fixed set of multi word
> -- type names, plus all the type names which are
> -- reserved words
> reservedTypeNames = (:[]) . Name . unwords <$> makeKeywordTree
> ["double precision"
> ,"character varying"
> ,"char varying"
> ,"character large object"
> ,"char large object"
> ,"national character"
> ,"national char"
> ,"national character varying"
> ,"national char varying"
> ,"national character large object"
> ,"nchar large object"
> ,"nchar varying"
> ,"bit varying"
> ,"binary large object"
> -- reserved keyword typenames:
> ,"array"
> ,"bigint"
> ,"binary"
> ,"blob"
> ,"boolean"
> ,"char"
> ,"character"
> ,"clob"
> ,"date"
> ,"dec"
> ,"decimal"
> ,"double"
> ,"float"
> ,"int"
> ,"integer"
> ,"nchar"
> ,"nclob"
> ,"numeric"
> ,"real"
> ,"smallint"
> ,"time"
> ,"timestamp"
> ,"varchar"
> ]
= Value expressions
2013-12-13 11:39:26 +01:00
2014-04-19 12:10:46 +02:00
== simple literals
2013-12-14 09:55:44 +01:00
See the stringToken lexer below for notes on string literal syntax.
> stringValue :: Parser ValueExpr
> stringValue = StringLit <$> stringToken
> number :: Parser ValueExpr
2013-12-14 09:55:44 +01:00
> number = NumLit <$> numberLiteral
> characterSetLiteral :: Parser ValueExpr
> characterSetLiteral =
> CSStringLit <$> shortCSPrefix <*> stringToken
> where
> shortCSPrefix =
> choice
> [(:[]) <$> oneOf "nNbBxX"
> ,string "u&"
> ,string "U&"
2014-04-18 11:28:05 +02:00
> ] <* lookAhead quote
2014-04-19 12:10:46 +02:00
TODO: remove try and relocate some
> literal :: Parser ValueExpr
2014-04-18 18:49:00 +02:00
> literal = number <|> stringValue <|> try interval <|> try characterSetLiteral
2014-04-19 12:10:46 +02:00
== star, param, host param
=== star
2013-12-14 09:55:44 +01:00
2013-12-17 11:24:37 +01:00
used in select *, select x.*, and agg(*) variations, and some other
2013-12-17 14:21:43 +01:00
places as well. Because it is quite general, the parser doesn't
2013-12-17 21:15:19 +01:00
attempt to check that the star is in a valid context, it parses it OK
in any value expression context.
2013-12-14 09:55:44 +01:00
> star :: Parser ValueExpr
2013-12-17 14:21:43 +01:00
> star = Star <$ symbol "*"
2013-12-13 11:39:26 +01:00
== parameter
2014-04-19 12:10:46 +02:00
unnamed parameter
use in e.g. select * from t where a = ?
> parameter :: Parser ValueExpr
> parameter = Parameter <$ questionMark
2014-04-17 18:27:18 +02:00
named parameter:
select x from t where x > :param
> hostParameter :: Parser ValueExpr
> hostParameter =
> HostParameter
> <$> hostParameterToken
> <*> optionMaybe (keyword "indicator" *> hostParameterToken)
2014-04-19 12:10:46 +02:00
== parens
2013-12-13 11:39:26 +01:00
2014-04-19 12:10:46 +02:00
value expression parens, row ctor and scalar subquery
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
> parensTerm :: Parser ValueExpr
> parensTerm = parens $ choice
> [SubQueryExpr SqSq <$> queryExpr
> ,ctor <$> commaSep1 valueExpr]
> where
> ctor [a] = Parens a
> ctor as = SpecialOp [Name "rowctor"] as
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
== case, cast, exists, unique, array/multiset constructor, interval
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
All of these start with a fixed keyword which is reserved.
2014-04-18 11:28:05 +02:00
2014-04-19 12:10:46 +02:00
=== case expression
> caseValue :: Parser ValueExpr
> caseValue =
> Case <$> (keyword_ "case" *> optionMaybe valueExpr)
> <*> many1 whenClause
> <*> optionMaybe elseClause
> <* keyword_ "end"
2013-12-14 12:05:02 +01:00
> where
2014-04-19 12:10:46 +02:00
> whenClause = (,) <$> (keyword_ "when" *> commaSep1 valueExpr)
> <*> (keyword_ "then" *> valueExpr)
> elseClause = keyword_ "else" *> valueExpr
2014-04-19 12:10:46 +02:00
=== cast
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
cast: cast(expr as type)
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
> cast :: Parser ValueExpr
> cast = keyword_ "cast" >>
> parens (Cast <$> valueExpr
> <*> (keyword_ "as" *> typeName))
2014-04-19 12:10:46 +02:00
=== exists, unique
subquery expression:
[exists|unique] (queryexpr)
> subquery :: Parser ValueExpr
> subquery = SubQueryExpr <$> sqkw <*> parens queryExpr
2013-12-13 22:31:36 +01:00
> where
2014-04-19 12:10:46 +02:00
> sqkw = choice
> [SqExists <$ keyword_ "exists"
> ,SqUnique <$ keyword_ "unique"]
2013-12-13 22:31:36 +01:00
2014-04-19 12:10:46 +02:00
=== array/multiset constructor
> arrayCtor :: Parser ValueExpr
> arrayCtor = keyword_ "array" >>
> choice
> [ArrayCtor <$> parens queryExpr
> ,Array (Iden [Name "array"]) <$> brackets (commaSep valueExpr)]
As far as I can tell, table(query expr) is just syntax sugar for
multiset(query expr). It must be there for compatibility or something.
> multisetCtor :: Parser ValueExpr
> multisetCtor =
> choice
> [keyword_ "multiset" >>
> choice
> [MultisetQueryCtor <$> parens queryExpr
> ,MultisetCtor <$> brackets (commaSep valueExpr)]
> ,keyword_ "table" >>
> MultisetQueryCtor <$> parens queryExpr]
=== interval
interval literals are a special case and we follow the grammar less
permissively here
parse SQL interval literals, something like
interval '5' day (3)
or
interval '5' month
if the literal looks like this:
interval 'something'
then it is parsed as a regular typed literal. It must have a
interval-datetime-field suffix to parse as an intervallit
> interval :: Parser ValueExpr
> interval = keyword_ "interval" >> do
> s <- optionMaybe $ choice [True <$ symbol_ "+"
> ,False <$ symbol_ "-"]
> lit <- stringToken
> q <- optionMaybe intervalQualifier
> mkIt s lit q
> where
> mkIt Nothing val Nothing = return $ TypedLit (TypeName [Name "interval"]) val
> mkIt s val (Just (a,b)) = return $ IntervalLit s val a b
> mkIt (Just {}) _val Nothing = fail "cannot use sign without interval qualifier"
2014-04-17 20:05:47 +02:00
2014-04-19 12:10:46 +02:00
== typed literal, app, special, aggregate, window, iden
All of these start with identifiers (some of the special functions
start with reserved keywords).
they are all variations on suffixes on the basic identifier parser
The windows is a suffix on the app parser
=== iden prefix term
2014-04-17 20:05:47 +02:00
all the value expressions which start with an identifier
(todo: really put all of them here instead of just some of them)
> idenPrefixTerm :: Parser ValueExpr
> idenPrefixTerm =
> -- todo: work out how to left factor this
> try (TypedLit <$> typeName <*> stringToken)
> <|> (names >>= iden)
> where
> iden n = app n <|> return (Iden n)
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
typed literal
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
=== special
2013-12-14 09:55:44 +01:00
These are keyword operators which don't look like normal prefix,
postfix or infix binary operators. They mostly look like function
application but with keywords in the argument list instead of commas
to separate the arguments.
the special op keywords
parse an operator which is
operatorname(firstArg keyword0 arg0 keyword1 arg1 etc.)
> data SpecialOpKFirstArg = SOKNone
> | SOKOptional
> | SOKMandatory
> specialOpK :: String -- name of the operator
> -> SpecialOpKFirstArg -- has a first arg without a keyword
> -> [(String,Bool)] -- the other args with their keywords
> -- and whether they are optional
> -> Parser ValueExpr
> specialOpK opName firstArg kws =
> keyword_ opName >> do
> void openParen
> let pfa = do
> e <- valueExpr
> -- check we haven't parsed the first
> -- keyword as an identifier
> guard (case (e,kws) of
> (Iden [Name i], (k,_):_) | map toLower i == k -> False
> _ -> True)
> return e
> fa <- case firstArg of
> SOKNone -> return Nothing
> SOKOptional -> optionMaybe (try pfa)
> SOKMandatory -> Just <$> pfa
> as <- mapM parseArg kws
> void closeParen
> return $ SpecialOpK [Name opName] fa $ catMaybes as
> where
> parseArg (nm,mand) =
> let p = keyword_ nm >> valueExpr
> in fmap (nm,) <$> if mand
> then Just <$> p
> else optionMaybe (try p)
The actual operators:
EXTRACT( date_part FROM expression )
POSITION( string1 IN string2 )
SUBSTRING(extraction_string FROM starting_position [FOR length]
[COLLATE collation_name])
CONVERT(char_value USING conversion_char_name)
TRANSLATE(char_value USING translation_name)
OVERLAY(string PLACING embedded_string FROM start
[FOR length])
TRIM( [ [{LEADING | TRAILING | BOTH}] [removal_char] FROM ]
target_string
[COLLATE collation_name] )
> specialOpKs :: Parser ValueExpr
> specialOpKs = choice $ map try
> [extract, position, substring, convert, translate, overlay, trim]
2013-12-14 09:55:44 +01:00
> extract :: Parser ValueExpr
> extract = specialOpK "extract" SOKMandatory [("from", True)]
2013-12-13 21:38:43 +01:00
> position :: Parser ValueExpr
> position = specialOpK "position" SOKMandatory [("in", True)]
2013-12-14 09:55:44 +01:00
strictly speaking, the substring must have at least one of from and
for, but the parser doens't enforce this
2013-12-14 09:55:44 +01:00
> substring :: Parser ValueExpr
> substring = specialOpK "substring" SOKMandatory
> [("from", False),("for", False)]
> convert :: Parser ValueExpr
> convert = specialOpK "convert" SOKMandatory [("using", True)]
> translate :: Parser ValueExpr
> translate = specialOpK "translate" SOKMandatory [("using", True)]
> overlay :: Parser ValueExpr
> overlay = specialOpK "overlay" SOKMandatory
> [("placing", True),("from", True),("for", False)]
trim is too different because of the optional char, so a custom parser
the both ' ' is filled in as the default if either parts are missing
in the source
> trim :: Parser ValueExpr
> trim =
> keyword "trim" >>
> parens (mkTrim
> <$> option "both" sides
> <*> option " " stringToken
> <*> (keyword_ "from" *> valueExpr))
> where
> sides = choice ["leading" <$ keyword_ "leading"
> ,"trailing" <$ keyword_ "trailing"
> ,"both" <$ keyword_ "both"]
> mkTrim fa ch fr =
> SpecialOpK [Name "trim"] Nothing
> $ catMaybes [Just (fa,StringLit ch)
> ,Just ("from", fr)]
2014-04-19 12:10:46 +02:00
=== app, aggregate, window
this represents anything which syntactically looks like regular C
function application: an identifier, parens with comma sep value
expression arguments.
The parsing for the aggregate extensions is here as well:
aggregate([all|distinct] args [order by orderitems])
TODO: try to refactor the parser to not allow distinct/all or order by
if there are no value exprs
> aggOrApp :: [Name] -> Parser ValueExpr
> aggOrApp n =
> makeApp n
> <$> parens ((,,) <$> (fromMaybe SQDefault <$> duplicates)
> <*> choice [commaSep valueExpr]
> <*> (optionMaybe orderBy))
> where
> makeApp i (SQDefault,es,Nothing) = App i es
> makeApp i (d,es,od) = AggregateApp i d es (fromMaybe [] od)
> app :: [Name] -> Parser ValueExpr
> app n = aggOrApp n >>= optionSuffix windowSuffix
==== window suffix
parse a window call as a suffix of a regular function call
this looks like this:
functionname(args) over ([partition by ids] [order by orderitems])
No support for explicit frames yet.
The convention in this file is that the 'Suffix', erm, suffix on
parser names means that they have been left factored. These are almost
always used with the optionSuffix combinator.
> windowSuffix :: ValueExpr -> Parser ValueExpr
> windowSuffix (App f es) =
> keyword_ "over"
> *> parens (WindowApp f es
> <$> option [] partitionBy
> <*> option [] orderBy
> <*> optionMaybe frameClause)
> where
> partitionBy = keywords_ ["partition","by"] >> commaSep1 valueExpr
> frameClause =
> mkFrame <$> choice [FrameRows <$ keyword_ "rows"
> ,FrameRange <$ keyword_ "range"]
> <*> frameStartEnd
> frameStartEnd =
> choice
> [keyword_ "between" >>
> mkFrameBetween <$> frameLimit True
> <*> (keyword_ "and" *> frameLimit True)
> ,mkFrameFrom <$> frameLimit False]
> -- use the bexpression style from the between parsing for frame between
> frameLimit useB =
> choice
> [Current <$ keywords_ ["current", "row"]
> -- todo: create an automatic left factor for stuff like
> -- this
> ,keyword_ "unbounded" >>
> choice [UnboundedPreceding <$ keyword_ "preceding"
> ,UnboundedFollowing <$ keyword_ "following"]
> ,do
> e <- if useB then valueExprB else valueExpr
> choice [Preceding e <$ keyword_ "preceding"
> ,Following e <$ keyword_ "following"]
> ]
> mkFrameBetween s e rs = FrameBetween rs s e
> mkFrameFrom s rs = FrameFrom rs s
> mkFrame rs c = c rs
> windowSuffix _ = fail ""
== suffixes
These are all generic suffixes on any value expr
=== in
2013-12-14 09:55:44 +01:00
in: two variations:
a in (expr0, expr1, ...)
a in (queryexpr)
this is parsed as a postfix operator which is why it is in this form
> inSuffix :: Parser (ValueExpr -> ValueExpr)
> inSuffix =
> mkIn <$> inty
> <*> parens (choice
> [InQueryExpr <$> queryExpr
> ,InList <$> commaSep1 valueExpr])
> where
> inty = choice [True <$ keyword_ "in"
2014-04-18 11:28:05 +02:00
> ,False <$ keywords_ ["not","in"]]
> mkIn i v = \e -> In i e v
2014-04-19 12:10:46 +02:00
=== between
2013-12-14 09:55:44 +01:00
between:
expr between expr and expr
There is a complication when parsing between - when parsing the second
expression it is ambiguous when you hit an 'and' whether it is a
binary operator or part of the between. This code follows what
postgres does, which might be standard across SQL implementations,
which is that you can't have a binary and operator in the middle
expression in a between unless it is wrapped in parens. The 'bExpr
parsing' is used to create alternative value expression parser which
2013-12-14 09:55:44 +01:00
is identical to the normal one expect it doesn't recognise the binary
and operator. This is the call to valueExprB.
2013-12-14 09:55:44 +01:00
> betweenSuffix :: Parser (ValueExpr -> ValueExpr)
> betweenSuffix =
> makeOp <$> Name <$> opName
> <*> valueExprB
> <*> (keyword_ "and" *> valueExprB)
2013-12-13 20:13:36 +01:00
> where
> opName = choice
2013-12-13 20:13:36 +01:00
> ["between" <$ keyword_ "between"
2014-04-18 11:28:05 +02:00
> ,"not between" <$ try (keywords_ ["not","between"])]
> makeOp n b c = \a -> SpecialOp [n] [a,b,c]
2013-12-13 20:13:36 +01:00
2014-04-19 12:10:46 +02:00
=== quantified comparison
a = any (select * from t)
> quantifiedComparison :: Parser (ValueExpr -> ValueExpr)
> quantifiedComparison = do
> c <- comp
> cq <- compQuan
> q <- parens queryExpr
> return $ \v -> QuantifiedComparison v [c] cq q
> where
> comp = Name <$> choice (map symbol
> ["=", "<>", "<=", "<", ">", ">="])
> compQuan = choice
> [CPAny <$ keyword_ "any"
> ,CPSome <$ keyword_ "some"
> ,CPAll <$ keyword_ "all"]
2014-04-19 12:10:46 +02:00
=== match
a match (select a from t)
> matchPredicate :: Parser (ValueExpr -> ValueExpr)
> matchPredicate = do
> keyword_ "match"
> u <- option False (True <$ keyword_ "unique")
> q <- parens queryExpr
> return $ \v -> Match v u q
2014-04-19 12:10:46 +02:00
=== array subscript
> arrayPostfix :: Parser (ValueExpr -> ValueExpr)
> arrayPostfix = do
> es <- brackets (commaSep valueExpr)
> return $ \v -> Array v es
2014-04-19 12:10:46 +02:00
=== escape
2014-04-18 19:50:24 +02:00
> escape :: Parser (ValueExpr -> ValueExpr)
> escape = do
> ctor <- choice
> [Escape <$ keyword_ "escape"
> ,UEscape <$ keyword_ "uescape"]
> c <- anyChar
> return $ \v -> ctor v c
2014-04-19 12:10:46 +02:00
=== collate
> collate :: Parser (ValueExpr -> ValueExpr)
> collate = do
> keyword_ "collate"
> i <- names
> return $ \v -> Collate v i
2013-12-14 09:55:44 +01:00
2014-04-19 12:10:46 +02:00
== operators
2013-12-14 09:55:44 +01:00
The 'regular' operators in this parsing and in the abstract syntax are
unary prefix, unary postfix and binary infix operators. The operators
can be symbols (a + b), single keywords (a and b) or multiple keywords
(a is similar to b).
2013-12-31 11:20:07 +01:00
TODO: carefully review the precedences and associativities.
TODO: to fix the parsing completely, I think will need to parse
without precedence and associativity and fix up afterwards, since SQL
syntax is way too messy. It might be possible to avoid this if we
wanted to avoid extensibility and to not be concerned with parse error
2014-04-19 12:10:46 +02:00
messages, but both of these are too important.
> opTable :: Bool -> [[E.Operator String () Identity ValueExpr]]
> opTable bExpr =
> [-- parse match and quantified comparisons as postfix ops
> -- todo: left factor the quantified comparison with regular
> -- binary comparison, somehow
> [E.Postfix $ try quantifiedComparison
> ,E.Postfix matchPredicate
> ]
> ,[binarySym "." E.AssocLeft]
> ,[postfix' arrayPostfix
> ,postfix' escape
> ,postfix' collate]
> ,[prefixSym "+", prefixSym "-"]
> ,[binarySym "^" E.AssocLeft]
> ,[binarySym "*" E.AssocLeft
> ,binarySym "/" E.AssocLeft
> ,binarySym "%" E.AssocLeft]
> ,[binarySym "+" E.AssocLeft
> ,binarySym "-" E.AssocLeft]
> ,[binarySym ">=" E.AssocNone
> ,binarySym "<=" E.AssocNone
> ,binarySym "!=" E.AssocRight
> ,binarySym "<>" E.AssocRight
> ,binarySym "||" E.AssocRight
> ,prefixSym "~"
> ,binarySym "&" E.AssocRight
> ,binarySym "|" E.AssocRight
> ,binaryKeyword "like" E.AssocNone
> ,binaryKeyword "overlaps" E.AssocNone]
> ++ [binaryKeywords $ makeKeywordTree
> ["not like"
> ,"is similar to"
> ,"is not similar to"
> ,"is distinct from"
> ,"is not distinct from"]
> ,postfixKeywords $ makeKeywordTree
> ["is null"
> ,"is not null"
> ,"is true"
> ,"is not true"
> ,"is false"
> ,"is not false"
> ,"is unknown"
> ,"is not unknown"]
> ]
2014-04-18 19:50:24 +02:00
> ++ [multisetBinOp]
> -- have to use try with inSuffix because of a conflict
> -- with 'in' in position function, and not between
> -- between also has a try in it to deal with 'not'
> -- ambiguity
> ++ [E.Postfix $ try inSuffix,E.Postfix betweenSuffix]
> ]
> ++
> [[binarySym "<" E.AssocNone
> ,binarySym ">" E.AssocNone]
> ,[binarySym "=" E.AssocRight]
> ,[prefixKeyword "not"]]
> ++
> if bExpr then [] else [[binaryKeyword "and" E.AssocLeft]]
> ++
> [[binaryKeyword "or" E.AssocLeft]]
> where
> binarySym nm assoc = binary (symbol_ nm) nm assoc
> binaryKeyword nm assoc = binary (keyword_ nm) nm assoc
> binaryKeywords p =
> E.Infix (do
> o <- try p
> return (\a b -> BinOp a [Name $ unwords o] b))
> E.AssocNone
> postfixKeywords p =
> postfix' $ do
> o <- try p
> return $ PostfixOp [Name $ unwords o]
> binary p nm assoc =
> E.Infix (p >> return (\a b -> BinOp a [Name nm] b)) assoc
2014-04-18 19:50:24 +02:00
> multisetBinOp = E.Infix (do
> keyword_ "multiset"
> o <- choice [Union <$ keyword_ "union"
> ,Intersect <$ keyword_ "intersect"
> ,Except <$ keyword_ "except"]
> d <- fromMaybe SQDefault <$> duplicates
> return (\a b -> MultisetBinOp a o d b))
> E.AssocLeft
> prefixKeyword nm = prefix (keyword_ nm) nm
> prefixSym nm = prefix (symbol_ nm) nm
> prefix p nm = prefix' (p >> return (PrefixOp [Name nm]))
> -- hack from here
> -- http://stackoverflow.com/questions/10475337/parsec-expr-repeated-prefix-postfix-operator-not-supported
> -- not implemented properly yet
> -- I don't think this will be enough for all cases
> -- at least it works for 'not not a'
> -- ok: "x is not true is not true"
> -- no work: "x is not true is not null"
> prefix' p = E.Prefix . chainl1 p $ return (.)
> postfix' p = E.Postfix . chainl1 p $ return (flip (.))
2013-12-13 11:39:26 +01:00
2014-04-19 12:10:46 +02:00
== value expression top level
This parses most of the value exprs.The order of the parsers and use
of try is carefully done to make everything work. It is a little
fragile and could at least do with some heavy explanation.
2013-12-13 11:39:26 +01:00
> valueExpr :: Parser ValueExpr
> valueExpr = E.buildExpressionParser (opTable False) term
> term :: Parser ValueExpr
> term = choice [literal
> ,parameter
2014-04-17 18:27:18 +02:00
> ,hostParameter
2014-04-19 12:10:46 +02:00
> ,star
> ,parensTerm
> ,caseValue
> ,cast
> ,arrayCtor
2014-04-18 19:50:24 +02:00
> ,multisetCtor
> ,subquery
2014-04-19 12:10:46 +02:00
> ,specialOpKs
2014-04-17 20:05:47 +02:00
> ,idenPrefixTerm]
> <?> "value expression"
2013-12-13 11:39:26 +01:00
expose the b expression for window frame clause range between
> valueExprB :: Parser ValueExpr
> valueExprB = E.buildExpressionParser (opTable True) term
2014-04-19 12:10:46 +02:00
== helper parsers
> intervalQualifier :: Parser (IntervalTypeField,Maybe IntervalTypeField)
> intervalQualifier =
> (,) <$> intervalField
> <*> optionMaybe (keyword_ "to" *> intervalField)
> where
> intervalField =
> Itf
> <$> datetimeField
> <*> optionMaybe
> (parens ((,) <$> unsignedInteger
> <*> optionMaybe (comma *> unsignedInteger)))
TODO: use this in extract
use a data type for the datetime field?
> datetimeField :: Parser String
> datetimeField = choice (map keyword ["year","month","day"
> ,"hour","minute","second"])
> <?> "datetime field"
> duplicates :: Parser (Maybe SetQuantifier)
> duplicates = optionMaybe $
> choice [All <$ keyword_ "all"
> ,Distinct <$ keyword "distinct"]
2013-12-13 11:39:26 +01:00
-------------------------------------------------
= query expressions
2013-12-14 09:55:44 +01:00
== select lists
2013-12-13 16:27:02 +01:00
> selectItem :: Parser (ValueExpr,Maybe Name)
> selectItem = (,) <$> valueExpr <*> optionMaybe als
> where als = optional (keyword_ "as") *> name
2013-12-13 11:39:26 +01:00
> selectList :: Parser [(ValueExpr,Maybe Name)]
2013-12-13 16:27:02 +01:00
> selectList = commaSep1 selectItem
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== from
2013-12-14 12:05:02 +01:00
Here is the rough grammar for joins
2013-12-14 09:55:44 +01:00
tref
2013-12-14 12:05:02 +01:00
(cross | [natural] ([inner] | (left | right | full) [outer])) join
tref
2013-12-14 09:55:44 +01:00
[on expr | using (...)]
> from :: Parser [TableRef]
> from = keyword_ "from" *> commaSep1 tref
2013-12-13 11:39:26 +01:00
> where
2013-12-14 12:05:02 +01:00
> tref = nonJoinTref >>= optionSuffix joinTrefSuffix
> nonJoinTref = choice
> [parens $ choice
> [TRQueryExpr <$> queryExpr
> ,TRParens <$> tref]
> ,TRLateral <$> (keyword_ "lateral"
> *> nonJoinTref)
> ,do
> n <- names
> choice [TRFunction n
> <$> parens (commaSep valueExpr)
> ,return $ TRSimple n]]
> >>= optionSuffix aliasSuffix
> aliasSuffix j = option j (TRAlias j <$> alias)
> joinTrefSuffix t =
> (TRJoin t <$> option False (True <$ keyword_ "natural")
> <*> joinType
2013-12-14 13:10:46 +01:00
> <*> nonJoinTref
> <*> optionMaybe joinCondition)
2013-12-14 12:05:02 +01:00
> >>= optionSuffix joinTrefSuffix
2014-04-18 11:28:05 +02:00
TODO: factor the join stuff to produce better error messages
> joinType :: Parser JoinType
> joinType = choice
> [JCross <$ keyword_ "cross" <* keyword_ "join"
> ,JInner <$ keyword_ "inner" <* keyword_ "join"
> ,JLeft <$ keyword_ "left"
> <* optional (keyword_ "outer")
> <* keyword_ "join"
> ,JRight <$ keyword_ "right"
> <* optional (keyword_ "outer")
> <* keyword_ "join"
> ,JFull <$ keyword_ "full"
> <* optional (keyword_ "outer")
> <* keyword_ "join"
> ,JInner <$ keyword_ "join"]
> joinCondition :: Parser JoinCondition
> joinCondition =
> choice [keyword_ "on" >> JoinOn <$> valueExpr
> ,keyword_ "using" >> JoinUsing <$> parens (commaSep1 name)
> ]
2013-12-13 11:39:26 +01:00
> alias :: Parser Alias
> alias = Alias <$> tableAlias <*> columnAliases
> where
> tableAlias = optional (keyword_ "as") *> name
> columnAliases = optionMaybe $ parens $ commaSep1 name
2013-12-14 09:55:44 +01:00
== simple other parts
Parsers for where, group by, having, order by and limit, which are
pretty trivial.
> whereClause :: Parser ValueExpr
> whereClause = keyword_ "where" *> valueExpr
2013-12-13 11:39:26 +01:00
> groupByClause :: Parser [GroupingExpr]
2014-04-18 11:28:05 +02:00
> groupByClause = keywords_ ["group","by"]
2013-12-17 18:27:09 +01:00
> *> commaSep1 groupingExpression
> where
> groupingExpression =
> choice
> [keyword_ "cube" >>
2013-12-17 18:27:09 +01:00
> Cube <$> parens (commaSep groupingExpression)
> ,keyword_ "rollup" >>
2013-12-17 18:27:09 +01:00
> Rollup <$> parens (commaSep groupingExpression)
> ,GroupingParens <$> parens (commaSep groupingExpression)
2014-04-18 11:28:05 +02:00
> ,keywords_ ["grouping", "sets"] >>
2013-12-17 18:27:09 +01:00
> GroupingSets <$> parens (commaSep groupingExpression)
> ,SimpleGroup <$> valueExpr
2013-12-17 18:27:09 +01:00
> ]
2013-12-13 11:39:26 +01:00
> having :: Parser ValueExpr
> having = keyword_ "having" *> valueExpr
2013-12-13 11:39:26 +01:00
> orderBy :: Parser [SortSpec]
2014-04-18 11:28:05 +02:00
> orderBy = keywords_ ["order","by"] *> commaSep1 ob
2013-12-13 16:08:10 +01:00
> where
> ob = SortSpec
> <$> valueExpr
> <*> option DirDefault (choice [Asc <$ keyword_ "asc"
> ,Desc <$ keyword_ "desc"])
2013-12-17 17:28:31 +01:00
> <*> option NullsOrderDefault
2014-04-18 11:28:05 +02:00
> -- todo: left factor better
> (keyword_ "nulls" >>
2013-12-17 17:28:31 +01:00
> choice [NullsFirst <$ keyword "first"
> ,NullsLast <$ keyword "last"])
2013-12-13 11:39:26 +01:00
allows offset and fetch in either order
+ postgresql offset without row(s) and limit instead of fetch also
> offsetFetch :: Parser (Maybe ValueExpr, Maybe ValueExpr)
> offsetFetch = permute ((,) <$?> (Nothing, Just <$> offset)
> <|?> (Nothing, Just <$> fetch))
2013-12-13 16:27:02 +01:00
> offset :: Parser ValueExpr
> offset = keyword_ "offset" *> valueExpr
> <* option () (choice [keyword_ "rows"
> ,keyword_ "row"])
> fetch :: Parser ValueExpr
> fetch = choice [ansiFetch, limit]
2014-04-18 22:51:05 +02:00
> where
> fs = makeKeywordTree ["fetch first", "fetch next"]
> ro = makeKeywordTree ["rows only", "row only"]
> ansiFetch = fs *> valueExpr <* ro
> limit = keyword_ "limit" *> valueExpr
2013-12-13 16:27:02 +01:00
2013-12-14 09:55:44 +01:00
== common table expressions
> with :: Parser QueryExpr
> with = keyword_ "with" >>
> With <$> option False (True <$ keyword_ "recursive")
> <*> commaSep1 withQuery <*> queryExpr
> where
2013-12-14 00:14:23 +01:00
> withQuery =
> (,) <$> (alias <* keyword_ "as")
2013-12-14 00:14:23 +01:00
> <*> parens queryExpr
2013-12-13 16:27:02 +01:00
2013-12-14 09:55:44 +01:00
== query expression
This parser parses any query expression variant: normal select, cte,
and union, etc..
> queryExpr :: Parser QueryExpr
2013-12-13 11:39:26 +01:00
> queryExpr =
> choice [with
2013-12-17 12:58:44 +01:00
> ,choice [values,table, select]
> >>= optionSuffix queryExprSuffix]
> where
> select = keyword_ "select" >>
> mkSelect
> <$> (fromMaybe SQDefault <$> duplicates)
> <*> selectList
> <*> optionMaybe tableExpression
> mkSelect d sl Nothing =
> makeSelect{qeSetQuantifier = d, qeSelectList = sl}
> mkSelect d sl (Just (TableExpression f w g h od ofs fe)) =
> Select d sl f w g h od ofs fe
> values = keyword_ "values"
> >> Values <$> commaSep (parens (commaSep valueExpr))
> table = keyword_ "table" >> Table <$> names
local data type to help with parsing the bit after the select list,
called 'table expression' in the ansi sql grammar. Maybe this should
be in the public syntax?
> data TableExpression
> = TableExpression
> {_teFrom :: [TableRef]
> ,_teWhere :: Maybe ValueExpr
> ,_teGroupBy :: [GroupingExpr]
> ,_teHaving :: Maybe ValueExpr
> ,_teOrderBy :: [SortSpec]
> ,_teOffset :: Maybe ValueExpr
> ,_teFetchFirst :: Maybe ValueExpr}
> tableExpression :: Parser TableExpression
> tableExpression =
> mkTe <$> from
> <*> optionMaybe whereClause
> <*> option [] groupByClause
> <*> optionMaybe having
> <*> option [] orderBy
> <*> offsetFetch
> where
> mkTe f w g h od (ofs,fe) =
> TableExpression f w g h od ofs fe
2013-12-13 22:41:12 +01:00
> queryExprSuffix :: QueryExpr -> Parser QueryExpr
2013-12-13 22:41:12 +01:00
> queryExprSuffix qe =
> (CombineQueryExpr qe
> <$> (choice
> [Union <$ keyword_ "union"
> ,Intersect <$ keyword_ "intersect"
> ,Except <$ keyword_ "except"] <?> "set operator")
> <*> (fromMaybe SQDefault <$> duplicates)
> <*> option Respectively
> (Corresponding <$ keyword_ "corresponding")
> <*> queryExpr)
> >>= optionSuffix queryExprSuffix
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
wrapper for query expr which ignores optional trailing semicolon.
> topLevelQueryExpr :: Parser QueryExpr
2013-12-14 09:55:44 +01:00
> topLevelQueryExpr =
> queryExpr >>= optionSuffix ((semi *>) . return)
2013-12-14 09:55:44 +01:00
wrapper to parse a series of query exprs from a single source. They
must be separated by semicolon, but for the last expression, the
trailing semicolon is optional.
> queryExprs :: Parser [QueryExpr]
2013-12-14 16:09:45 +01:00
> queryExprs =
> (:[]) <$> queryExpr
> >>= optionSuffix ((semi *>) . return)
2013-12-14 10:59:29 +01:00
> >>= optionSuffix (\p -> (p++) <$> queryExprs)
----------------------------------------------
= multi keyword helper
This helper is to help parsing multiple options of multiple keywords
with similar prefixes, e.g. parsing 'is null' and 'is not null'.
use to left factor/ improve:
typed literal and general identifiers
not like, not in, not between operators
help with factoring keyword functions and other app-likes
the join keyword sequences
fetch first/next
row/rows only
There is probably a simpler way of doing this but I am a bit
thick.
> makeKeywordTree :: [String] -> Parser [String]
> makeKeywordTree sets =
> parseTrees (sort $ map words sets)
> -- ?? <?> intercalate "," sets
> where
> parseTrees :: [[String]] -> Parser [String]
> parseTrees ws = do
> let gs :: [[[String]]]
> gs = groupBy ((==) `on` safeHead) ws
> choice $ map parseGroup gs
> parseGroup :: [[String]] -> Parser [String]
> parseGroup l@((k:_):_) = do
> keyword_ k
> let tls = catMaybes $ map safeTail l
> pr = (k:) <$> parseTrees tls
> if (or $ map null tls)
> then pr <|> return [k]
> else pr
> parseGroup _ = guard False >> error "impossible"
> safeHead (x:_) = Just x
> safeHead [] = Nothing
> safeTail (_:x) = Just x
> safeTail [] = Nothing
2013-12-13 11:39:26 +01:00
------------------------------------------------
2013-12-14 09:55:44 +01:00
= lexing parsers
whitespace parser which skips comments also
> whitespace :: Parser ()
> whitespace =
> choice [simpleWhitespace *> whitespace
> ,lineComment *> whitespace
> ,blockComment *> whitespace
> ,return ()] <?> "whitespace"
2013-12-14 09:55:44 +01:00
> where
> lineComment = try (string "--")
> *> manyTill anyChar (void (char '\n') <|> eof)
> blockComment = -- no nesting of block comments in SQL
> try (string "/*")
> -- try used here so it doesn't fail when we see a
> -- '*' which isn't followed by a '/'
> *> manyTill anyChar (try $ string "*/")
> -- use many1 so we can more easily avoid non terminating loops
> simpleWhitespace = void $ many1 (oneOf " \t\n")
2013-12-17 12:21:36 +01:00
> lexeme :: Parser a -> Parser a
> lexeme p = p <* whitespace
2013-12-17 12:21:36 +01:00
> unsignedInteger :: Parser Integer
> unsignedInteger = read <$> lexeme (many1 digit) <?> "integer"
2013-12-14 09:55:44 +01:00
number literals
here is the rough grammar target:
digits
digits.[digits][e[+-]digits]
[digits].digits[e[+-]digits]
digitse[+-]digits
2013-12-14 16:09:45 +01:00
numbers are parsed to strings, not to a numeric type. This is to avoid
2013-12-14 09:55:44 +01:00
making a decision on how to represent numbers, the client code can
make this choice.
> numberLiteral :: Parser String
> numberLiteral = lexeme (
> (choice [int
2013-12-14 09:55:44 +01:00
> >>= optionSuffix dot
> >>= optionSuffix fracts
> >>= optionSuffix expon
> ,fract "" >>= optionSuffix expon])
> <* notFollowedBy (alphaNum <|> char '.'))
> <?> "number literal"
2013-12-14 09:55:44 +01:00
> where
> int = many1 digit
> fract p = dot p >>= fracts
2013-12-14 10:47:13 +01:00
> dot p = (p++) <$> string "."
2013-12-14 09:55:44 +01:00
> fracts p = (p++) <$> int
2013-12-14 10:47:13 +01:00
> expon p = concat <$> sequence
> [return p
> ,(:[]) <$> oneOf "eE"
2013-12-14 10:47:13 +01:00
> ,option "" (string "+" <|> string "-")
> ,int]
2013-12-14 09:55:44 +01:00
> identifier :: Parser String
> identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
> <?> "identifier"
> where
2014-04-18 11:28:05 +02:00
> firstChar = letter <|> char '_' <?> "identifier"
> nonFirstChar = digit <|> firstChar <?> ""
2013-12-14 09:55:44 +01:00
> quotedIdentifier :: Parser String
> quotedIdentifier = quotedIdenHelper
2013-12-13 11:39:26 +01:00
> quotedIdenHelper :: Parser String
> quotedIdenHelper =
> lexeme (dq *> manyTill anyChar dq >>= optionSuffix moreIden)
> <?> "identifier"
> where
> moreIden s0 = do
> void dq
> s <- manyTill anyChar dq
> optionSuffix moreIden (s0 ++ "\"" ++ s)
> dq = char '"' <?> "double quote"
> uquotedIdentifier :: Parser String
> uquotedIdentifier =
> try (string "u&" <|> string "U&") *> quotedIdenHelper
> <?> "identifier"
2014-04-17 18:27:18 +02:00
parses an identifier with a : prefix. The : isn't included in the
return value
> hostParameterToken :: Parser String
> hostParameterToken = lexeme $ char ':' *> identifier
todo: work out the symbol parsing better
> symbol :: String -> Parser String
> symbol s = try (lexeme $ do
> u <- choice (many1 (char '.') :
> map (try . string) [">=","<=","!=","<>","||"]
> ++ map (string . (:[])) "+-^*/%~&|<>=")
> guard (s == u)
> return s)
> <?> s
> questionMark :: Parser Char
2014-04-18 11:28:05 +02:00
> questionMark = lexeme (char '?') <?> "question mark"
> openParen :: Parser Char
> openParen = lexeme $ char '('
> closeParen :: Parser Char
> closeParen = lexeme $ char ')'
> openBracket :: Parser Char
> openBracket = lexeme $ char '['
> closeBracket :: Parser Char
> closeBracket = lexeme $ char ']'
> comma :: Parser Char
2014-04-18 11:28:05 +02:00
> comma = lexeme (char ',') <?> "comma"
> semi :: Parser Char
2014-04-18 11:28:05 +02:00
> semi = lexeme (char ';') <?> "semicolon"
2014-04-18 11:28:05 +02:00
> quote :: Parser Char
> quote = lexeme (char '\'') <?> "single quote"
> --stringToken :: Parser String
> --stringToken = lexeme (char '\'' *> manyTill anyChar (char '\''))
> -- todo: tidy this up, add the prefixes stuff, and add the multiple
> -- string stuff
> stringToken :: Parser String
> stringToken =
2014-04-18 11:28:05 +02:00
> lexeme (nlquote *> manyTill anyChar nlquote
> >>= optionSuffix moreString)
> <?> "string"
2013-12-13 11:39:26 +01:00
> where
> moreString s0 = choice
> [-- handle two adjacent quotes
> do
2014-04-18 11:28:05 +02:00
> void nlquote
> s <- manyTill anyChar nlquote
> optionSuffix moreString (s0 ++ "'" ++ s)
> ,-- handle string in separate parts
> -- e.g. 'part 1' 'part 2'
2014-04-18 22:51:05 +02:00
> do --can this whitespace be factored out?
2014-04-18 11:28:05 +02:00
> try (whitespace <* nlquote)
> s <- manyTill anyChar nlquote
> optionSuffix moreString (s0 ++ s)
> ]
2014-04-18 11:28:05 +02:00
> -- non lexeme quote
> nlquote = char '\'' <?> "single quote"
2013-12-13 11:39:26 +01:00
= helper functions
> keyword :: String -> Parser String
> keyword k = try (do
> i <- identifier
> guard (map toLower i == k)
> return k) <?> k
2014-04-18 11:28:05 +02:00
helper function to improve error messages
> keywords_ :: [String] -> Parser ()
> keywords_ ks = mapM_ keyword_ ks <?> intercalate " " ks
> parens :: Parser a -> Parser a
> parens = between openParen closeParen
> brackets :: Parser a -> Parser a
> brackets = between openBracket closeBracket
> commaSep :: Parser a -> Parser [a]
> commaSep = (`sepBy` comma)
> keyword_ :: String -> Parser ()
> keyword_ = void . keyword
> symbol_ :: String -> Parser ()
> symbol_ = void . symbol
> commaSep1 :: Parser a -> Parser [a]
> commaSep1 = (`sepBy1` comma)
2013-12-14 09:55:44 +01:00
2013-12-14 12:05:02 +01:00
a possible issue with the option suffix is that it enforces left
associativity when chaining it recursively. Have to review
all these uses and figure out if any should be right associative
instead, and create an alternative suffix parser
> optionSuffix :: (a -> Parser a) -> a -> Parser a
2013-12-13 11:39:26 +01:00
> optionSuffix p a = option a (p a)
> identifierBlacklist :: [String] -> Parser String
> identifierBlacklist bl = try (do
> i <- identifier
> when (map toLower i `elem` bl) $
> fail $ "keyword not allowed here: " ++ i
> return i)
> <?> "identifier"
2013-12-13 11:39:26 +01:00
> blacklist :: [String]
> blacklist = reservedWord {-
> [-- case
> "case", "when", "then", "else", "end"
> ,--join
> "natural","inner","outer","cross","left","right","full","join"
> ,"on","using","lateral"
> ,"from","where","group","having","order","limit", "offset", "fetch"
> ,"as","in"
> ,"except", "intersect", "union"
> ] -}
2013-12-13 11:39:26 +01:00
These blacklisted names are mostly needed when we parse something with
an optional alias, e.g. select a a from t. If we write select a from
t, we have to make sure the from isn't parsed as an alias. I'm not
sure what other places strictly need the blacklist, and in theory it
could be tuned differently for each place the identifierString/
identifier parsers are used to only blacklist the bare minimum.
2013-12-14 00:14:23 +01:00
The standard has a weird mix of reserved keywords and unreserved
keywords (I'm not sure what exactly being an unreserved keyword
means).
> _nonReservedWord :: [String]
> _nonReservedWord =
> ["a"
> ,"abs"
> ,"absolute"
> ,"action"
> ,"ada"
> ,"admin"
> ,"after"
> ,"always"
> ,"asc"
> ,"assertion"
> ,"assignment"
> ,"attribute"
> ,"attributes"
> ,"avg"
> ,"before"
> ,"bernoulli"
> ,"breadth"
> ,"c"
> ,"cardinality"
> ,"cascade"
> ,"catalog"
> ,"catalog_name"
> ,"ceil"
> ,"ceiling"
> ,"chain"
> ,"characteristics"
> ,"characters"
> ,"character_length"
> ,"character_set_catalog"
> ,"character_set_name"
> ,"character_set_schema"
> ,"char_length"
> ,"checked"
> ,"class_origin"
> ,"coalesce"
> ,"cobol"
> ,"code_units"
> ,"collation"
> ,"collation_catalog"
> ,"collation_name"
> ,"collation_schema"
> ,"collect"
> ,"column_name"
> ,"command_function"
> ,"command_function_code"
> ,"committed"
> ,"condition"
> ,"condition_number"
> ,"connection_name"
> ,"constraints"
> ,"constraint_catalog"
> ,"constraint_name"
> ,"constraint_schema"
> ,"constructors"
> ,"contains"
> ,"convert"
> ,"corr"
> ,"count"
> ,"covar_pop"
> ,"covar_samp"
> ,"cume_dist"
> ,"current_collation"
> ,"cursor_name"
> ,"data"
> ,"datetime_interval_code"
> ,"datetime_interval_precision"
> ,"defaults"
> ,"deferrable"
> ,"deferred"
> ,"defined"
> ,"definer"
> ,"degree"
> ,"dense_rank"
> ,"depth"
> ,"derived"
> ,"desc"
> ,"descriptor"
> ,"diagnostics"
> ,"dispatch"
> ,"domain"
> ,"dynamic_function"
> ,"dynamic_function_code"
> ,"equals"
> ,"every"
> ,"exception"
> ,"exclude"
> ,"excluding"
> ,"exp"
> ,"extract"
> ,"final"
> ,"first"
> ,"floor"
> ,"following"
> ,"fortran"
> ,"found"
> ,"fusion"
> ,"g"
> ,"general"
> ,"go"
> ,"goto"
> ,"granted"
> ,"hierarchy"
> ,"implementation"
> ,"including"
> ,"increment"
> ,"initially"
> ,"instance"
> ,"instantiable"
> ,"intersection"
> ,"invoker"
> ,"isolation"
> ,"k"
> ,"key"
> ,"key_member"
> ,"key_type"
> ,"last"
> ,"length"
> ,"level"
> ,"ln"
> ,"locator"
> ,"lower"
> ,"m"
> ,"map"
> ,"matched"
> ,"max"
> ,"maxvalue"
> ,"message_length"
> ,"message_octet_length"
> ,"message_text"
> ,"min"
> ,"minvalue"
> ,"mod"
> ,"more"
> ,"mumps"
> ,"name"
> ,"names"
> ,"nesting"
> ,"next"
> ,"normalize"
> ,"normalized"
> ,"nullable"
> ,"nullif"
> ,"nulls"
> ,"number"
> ,"object"
> ,"octets"
> ,"octet_length"
> ,"option"
> ,"options"
> ,"ordering"
> ,"ordinality"
> ,"others"
> ,"overlay"
> ,"overriding"
> ,"pad"
> ,"parameter_mode"
> ,"parameter_name"
> ,"parameter_ordinal_position"
> ,"parameter_specific_catalog"
> ,"parameter_specific_name"
> ,"parameter_specific_schema"
> ,"partial"
> ,"pascal"
> ,"path"
> ,"percentile_cont"
> ,"percentile_disc"
> ,"percent_rank"
> ,"placing"
> ,"pli"
> ,"position"
> ,"power"
> ,"preceding"
> ,"preserve"
> ,"prior"
> ,"privileges"
> ,"public"
> ,"rank"
> ,"read"
> ,"relative"
> ,"repeatable"
> ,"restart"
> ,"returned_cardinality"
> ,"returned_length"
> ,"returned_octet_length"
> ,"returned_sqlstate"
> ,"role"
> ,"routine"
> ,"routine_catalog"
> ,"routine_name"
> ,"routine_schema"
> ,"row_count"
> ,"row_number"
> ,"scale"
> ,"schema"
> ,"schema_name"
> ,"scope_catalog"
> ,"scope_name"
> ,"scope_schema"
> ,"section"
> ,"security"
> ,"self"
> ,"sequence"
> ,"serializable"
> ,"server_name"
> ,"session"
> ,"sets"
> ,"simple"
> ,"size"
> ,"source"
> ,"space"
> ,"specific_name"
> ,"sqrt"
> ,"state"
> ,"statement"
> ,"stddev_pop"
> ,"stddev_samp"
> ,"structure"
> ,"style"
> ,"subclass_origin"
> ,"substring"
> ,"sum"
> ,"tablesample"
> ,"table_name"
> ,"temporary"
> ,"ties"
> ,"top_level_count"
> ,"transaction"
> ,"transactions_committed"
> ,"transactions_rolled_back"
> ,"transaction_active"
> ,"transform"
> ,"transforms"
> ,"translate"
> ,"trigger_catalog"
> ,"trigger_name"
> ,"trigger_schema"
> ,"trim"
> ,"type"
> ,"unbounded"
> ,"uncommitted"
> ,"under"
> ,"unnamed"
> ,"usage"
> ,"user_defined_type_catalog"
> ,"user_defined_type_code"
> ,"user_defined_type_name"
> ,"user_defined_type_schema"
> ,"view"
> ,"work"
> ,"write"
> ,"zone"]
> reservedWord :: [String]
> reservedWord =
> ["add"
> ,"all"
> ,"allocate"
> ,"alter"
> ,"and"
> ,"any"
> ,"are"
> ,"array"
> ,"as"
> ,"asensitive"
> ,"asymmetric"
> ,"at"
> ,"atomic"
> ,"authorization"
> ,"begin"
> ,"between"
> ,"bigint"
> ,"binary"
> ,"blob"
> ,"boolean"
> ,"both"
> ,"by"
> ,"call"
> ,"called"
> ,"cascaded"
> ,"case"
> ,"cast"
> ,"char"
> ,"character"
> ,"check"
> ,"clob"
> ,"close"
> ,"collate"
> ,"column"
> ,"commit"
> ,"connect"
> ,"constraint"
> ,"continue"
> ,"corresponding"
> ,"create"
> ,"cross"
> ,"cube"
> ,"current"
> --,"current_date"
> ,"current_default_transform_group"
> ,"current_path"
> ,"current_role"
> ,"current_time"
> ,"current_timestamp"
> ,"current_transform_group_for_type"
> ,"current_user"
> ,"cursor"
> ,"cycle"
> ,"date"
> --,"day"
> ,"deallocate"
> ,"dec"
> ,"decimal"
> ,"declare"
> --,"default"
> ,"delete"
> ,"deref"
> ,"describe"
> ,"deterministic"
> ,"disconnect"
> ,"distinct"
> ,"double"
> ,"drop"
> ,"dynamic"
> ,"each"
> --,"element"
> ,"else"
> ,"end"
> ,"end-exec"
> ,"escape"
> ,"except"
> ,"exec"
> ,"execute"
> ,"exists"
> ,"external"
> --,"false"
> ,"fetch"
> ,"filter"
> ,"float"
> ,"for"
> ,"foreign"
> ,"free"
> ,"from"
> ,"full"
> ,"function"
> ,"get"
> ,"global"
> ,"grant"
> ,"group"
> ,"grouping"
> ,"having"
> ,"hold"
> --,"hour"
> ,"identity"
> ,"immediate"
> ,"in"
> ,"indicator"
> ,"inner"
> ,"inout"
> ,"input"
> ,"insensitive"
> ,"insert"
> ,"int"
> ,"integer"
> ,"intersect"
> ,"interval"
> ,"into"
> ,"is"
> ,"isolation"
> ,"join"
> ,"language"
> ,"large"
> ,"lateral"
> ,"leading"
> ,"left"
> ,"like"
> ,"local"
> ,"localtime"
> ,"localtimestamp"
> ,"match"
> ,"member"
> ,"merge"
> ,"method"
> --,"minute"
> ,"modifies"
> ,"module"
> --,"month"
> ,"multiset"
> ,"national"
> ,"natural"
> ,"nchar"
> ,"nclob"
> ,"new"
> ,"no"
> ,"none"
> ,"not"
> --,"null"
> ,"numeric"
> ,"of"
> ,"old"
> ,"on"
> ,"only"
> ,"open"
> ,"or"
> ,"order"
> ,"out"
> ,"outer"
> ,"output"
> ,"over"
> ,"overlaps"
> ,"parameter"
> ,"partition"
> ,"precision"
> ,"prepare"
> ,"primary"
> ,"procedure"
> ,"range"
> ,"reads"
> ,"real"
> ,"recursive"
> ,"ref"
> ,"references"
> ,"referencing"
> ,"regr_avgx"
> ,"regr_avgy"
> ,"regr_count"
> ,"regr_intercept"
> ,"regr_r2"
> ,"regr_slope"
> ,"regr_sxx"
> ,"regr_sxy"
> ,"regr_syy"
> ,"release"
> ,"result"
> ,"return"
> ,"returns"
> ,"revoke"
> ,"right"
> ,"rollback"
> ,"rollup"
> --,"row"
> ,"rows"
> ,"savepoint"
> ,"scroll"
> ,"search"
> --,"second"
> ,"select"
> ,"sensitive"
> ,"session_user"
> --,"set"
> ,"similar"
> ,"smallint"
> ,"some"
> ,"specific"
> ,"specifictype"
> ,"sql"
> ,"sqlexception"
> ,"sqlstate"
> ,"sqlwarning"
> --,"start"
> ,"static"
> ,"submultiset"
> ,"symmetric"
> ,"system"
> ,"system_user"
> ,"table"
> ,"then"
> ,"time"
> ,"timestamp"
> ,"timezone_hour"
> ,"timezone_minute"
> ,"to"
> ,"trailing"
> ,"translation"
> ,"treat"
> ,"trigger"
> --,"true"
> ,"uescape"
> ,"union"
> ,"unique"
> --,"unknown"
> ,"unnest"
> ,"update"
> ,"upper"
> ,"user"
> ,"using"
> --,"value"
> ,"values"
> ,"var_pop"
> ,"var_samp"
> ,"varchar"
> ,"varying"
> ,"when"
> ,"whenever"
> ,"where"
> ,"width_bucket"
> ,"window"
> ,"with"
> ,"within"
> ,"without"
> --,"year"
> -- added for this parser
> ,"limit"
> ,"offset"
> ]
2013-12-14 00:14:23 +01:00
--------------------------------------------
= helper functions
> setPos :: Maybe (Int,Int) -> Parser ()
2013-12-14 00:14:23 +01:00
> setPos Nothing = return ()
> setPos (Just (l,c)) = fmap f getPosition >>= setPosition
> where f = flip setSourceColumn c
> . flip setSourceLine l
> convParseError :: String -> P.ParseError -> ParseError
> convParseError src e =
> ParseError
> {peErrorString = show e
> ,peFilename = sourceName p
> ,pePosition = (sourceLine p, sourceColumn p)
> ,peFormattedError = formatError src e
> }
> where
> p = errorPos e
format the error more nicely: emacs format for positioning, plus
context
> formatError :: String -> P.ParseError -> String
> formatError src e =
> sourceName p ++ ":" ++ show (sourceLine p)
> ++ ":" ++ show (sourceColumn p) ++ ":"
> ++ context
> ++ show e
> where
> context =
> let lns = take 1 $ drop (sourceLine p - 1) $ lines src
> in case lns of
> [x] -> "\n" ++ x ++ "\n"
> ++ replicate (sourceColumn p - 1) ' ' ++ "^\n"
> _ -> ""
> p = errorPos e