1
Fork 0
simple-sql-parser/Language/SQL/SimpleSQL/Parser.lhs

1041 lines
33 KiB
Plaintext
Raw Normal View History

2013-12-13 11:39:26 +01:00
> {-# LANGUAGE TupleSections #-}
> -- | This is the module with the parser functions.
2013-12-13 15:04:48 +01:00
> module Language.SQL.SimpleSQL.Parser
> (parseQueryExpr
> ,parseValueExpr
> ,parseQueryExprs
2013-12-13 18:21:44 +01:00
> ,ParseError(..)) where
2013-12-13 11:39:26 +01:00
> import Control.Monad.Identity
> import Control.Applicative hiding (many, (<|>), optional)
> import Data.Maybe
> import Data.Char
2013-12-14 09:55:44 +01:00
> import Text.Parsec hiding (ParseError)
> import qualified Text.Parsec as P
> import Text.Parsec.Perm
2013-12-13 11:39:26 +01:00
2013-12-13 15:04:48 +01:00
> import Language.SQL.SimpleSQL.Syntax
> import Language.SQL.SimpleSQL.Fixity
2013-12-13 11:39:26 +01:00
2013-12-14 16:09:45 +01:00
The public API functions.
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
> -- | Parses a query expr, trailing semicolon optional.
2013-12-16 09:03:46 +01:00
> parseQueryExpr :: FilePath
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
> -- ^ line number and column number to use in errors
> -> String
> -- ^ the SQL source to parse
2013-12-13 18:21:44 +01:00
> -> Either ParseError QueryExpr
2013-12-14 09:55:44 +01:00
> parseQueryExpr = wrapParse topLevelQueryExpr
2013-12-13 11:39:26 +01:00
2013-12-16 09:03:46 +01:00
> -- | Parses a list of query exprs, with semi colons between
> -- them. The final semicolon is optional.
> parseQueryExprs :: FilePath
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
> -- ^ line number and column number to use in errors
> -> String
> -- ^ the SQL source to parse
> -> Either ParseError [QueryExpr]
2013-12-14 09:55:44 +01:00
> parseQueryExprs = wrapParse queryExprs
> -- | Parses a value expression.
> parseValueExpr :: FilePath
2013-12-16 09:03:46 +01:00
> -- ^ filename to use in errors
> -> Maybe (Int,Int)
> -- ^ line number and column number to use in errors
> -> String
> -- ^ the SQL source to parse
> -> Either ParseError ValueExpr
> parseValueExpr = wrapParse valueExpr
2013-12-14 09:55:44 +01:00
This helper function takes the parser given and:
sets the position when parsing
automatically skips leading whitespace
checks the parser parses all the input using eof
converts the error return to the nice wrapper
> wrapParse :: P a
> -> FilePath
> -> Maybe (Int,Int)
> -> String
> -> Either ParseError a
> wrapParse parser f p src =
2013-12-13 18:21:44 +01:00
> either (Left . convParseError src) Right
2013-12-14 09:55:44 +01:00
> $ parse (setPos p *> whiteSpace *> parser <* eof) f src
2013-12-13 18:21:44 +01:00
2013-12-14 09:55:44 +01:00
> -- | Type to represent parse errors.
2013-12-13 18:21:44 +01:00
> data ParseError = ParseError
2013-12-16 09:03:46 +01:00
> {peErrorString :: String
> -- ^ contains the error message
> ,peFilename :: FilePath
> -- ^ filename location for the error
> ,pePosition :: (Int,Int)
> -- ^ line number and column number location for the error
> ,peFormattedError :: String
> -- ^ formatted error with the position, error
> -- message and source context
2013-12-13 19:01:57 +01:00
> } deriving (Eq,Show)
2013-12-14 00:01:07 +01:00
2013-12-14 00:14:23 +01:00
------------------------------------------------
2013-12-13 18:21:44 +01:00
2013-12-13 11:39:26 +01:00
> type P a = ParsecT String () Identity a
= value expressions
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== literals
See the stringLiteral lexer below for notes on string literal syntax.
> estring :: P ValueExpr
> estring = StringLit <$> stringLiteral
> number :: P ValueExpr
2013-12-14 09:55:44 +01:00
> number = NumLit <$> numberLiteral
parse SQL interval literals, something like
interval '5' day (3)
or
interval '5' month
2013-12-13 11:39:26 +01:00
wrap the whole lot in try, in case we get something like this:
interval '3 days'
which parses as a typed literal
> interval :: P ValueExpr
> interval = try (keyword_ "interval" >>
> IntervalLit
> <$> stringLiteral
> <*> identifierString
> <*> optionMaybe (try $ parens integerLiteral))
2013-12-13 11:39:26 +01:00
> literal :: P ValueExpr
> literal = number <|> estring <|> interval
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== identifiers
2013-12-13 11:39:26 +01:00
2013-12-16 09:03:46 +01:00
Uses the identifierString 'lexer'. See this function for notes on
identifiers.
2013-12-13 11:39:26 +01:00
2013-12-17 12:21:36 +01:00
> name :: P Name
> name = choice [QName <$> quotedIdentifier
> ,Name <$> identifierString]
> identifier :: P ValueExpr
2013-12-17 12:21:36 +01:00
> identifier = Iden <$> name
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== star
2013-12-17 11:24:37 +01:00
used in select *, select x.*, and agg(*) variations, and some other
2013-12-17 14:21:43 +01:00
places as well. Because it is quite general, the parser doesn't
2013-12-17 21:15:19 +01:00
attempt to check that the star is in a valid context, it parses it OK
in any value expression context.
2013-12-14 09:55:44 +01:00
> star :: P ValueExpr
2013-12-17 14:21:43 +01:00
> star = Star <$ symbol "*"
2013-12-13 11:39:26 +01:00
== parameter
use in e.g. select * from t where a = ?
> parameter :: P ValueExpr
> parameter = Parameter <$ symbol "?"
2013-12-14 09:55:44 +01:00
== function application, aggregates and windows
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
this represents anything which syntactically looks like regular C
function application: an identifier, parens with comma sep value
2013-12-14 09:55:44 +01:00
expression arguments.
The parsing for the aggregate extensions is here as well:
aggregate([all|distinct] args [order by orderitems])
> aggOrApp :: P ValueExpr
2013-12-14 12:05:02 +01:00
> aggOrApp =
> makeApp
2013-12-17 12:21:36 +01:00
> <$> name
2013-12-14 12:05:02 +01:00
> <*> parens ((,,) <$> try duplicates
> <*> choice [commaSep valueExpr']
2013-12-14 12:05:02 +01:00
> <*> try (optionMaybe orderBy))
> where
> makeApp i (Nothing,es,Nothing) = App i es
> makeApp i (d,es,od) = AggregateApp i d es (fromMaybe [] od)
> duplicates :: P (Maybe SetQuantifier)
2013-12-14 09:55:44 +01:00
> duplicates = optionMaybe $ try $
> choice [All <$ keyword_ "all"
> ,Distinct <$ keyword "distinct"]
parse a window call as a suffix of a regular function call
this looks like this:
2013-12-14 16:09:45 +01:00
functionname(args) over ([partition by ids] [order by orderitems])
2013-12-14 09:55:44 +01:00
No support for explicit frames yet.
The convention in this file is that the 'Suffix', erm, suffix on
parser names means that they have been left factored. These are almost
always used with the optionSuffix combinator.
> windowSuffix :: ValueExpr -> P ValueExpr
> windowSuffix (App f es) =
> try (keyword_ "over")
> *> parens (WindowApp f es
> <$> option [] partitionBy
> <*> option [] orderBy
> <*> optionMaybe frameClause)
2013-12-13 22:31:36 +01:00
> where
> partitionBy = try (keyword_ "partition") >>
> keyword_ "by" >> commaSep1 valueExpr'
> frameClause =
2013-12-17 19:27:11 +01:00
> mkFrame <$> choice [FrameRows <$ keyword_ "rows"
> ,FrameRange <$ keyword_ "range"]
> <*> frameStartEnd
> frameStartEnd =
> choice
> [try (keyword_ "between") >>
> mkFrameBetween <$> frameLimit True
> <*> (keyword_ "and" *> frameLimit True)
> ,mkFrameFrom <$> frameLimit False]
> -- use the bexpression style from the between parsing for frame between
> frameLimit useB =
> choice
> [Current <$ try (keyword_ "current") <* keyword_ "row"
> ,try (keyword_ "unbounded") >>
> choice [UnboundedPreceding <$ keyword_ "preceding"
> ,UnboundedFollowing <$ keyword_ "following"]
> ,do
> e <- if useB then valueExprB else valueExpr
> choice [Preceding e <$ keyword_ "preceding"
> ,Following e <$ keyword_ "following"]
> ]
> mkFrameBetween s e rs = FrameBetween rs s e
> mkFrameFrom s rs = FrameFrom rs s
> mkFrame rs c = c rs
> windowSuffix _ = fail ""
2013-12-13 22:31:36 +01:00
> app :: P ValueExpr
> app = aggOrApp >>= optionSuffix windowSuffix
2013-12-14 09:55:44 +01:00
== case expression
> scase :: P ValueExpr
2013-12-13 11:39:26 +01:00
> scase =
> Case <$> (try (keyword_ "case") *> optionMaybe (try valueExpr'))
2013-12-13 11:39:26 +01:00
> <*> many1 swhen
> <*> optionMaybe (try (keyword_ "else") *> valueExpr')
2013-12-13 11:39:26 +01:00
> <* keyword_ "end"
> where
> swhen = keyword_ "when" *>
> ((,) <$> commaSep1 valueExpr'
> <*> (keyword_ "then" *> valueExpr'))
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== miscellaneous keyword operators
These are keyword operators which don't look like normal prefix,
postfix or infix binary operators. They mostly look like function
application but with keywords in the argument list instead of commas
to separate the arguments.
cast: cast(expr as type)
> cast :: P ValueExpr
> cast = parensCast <|> prefixCast
> where
> parensCast = try (keyword_ "cast") >>
> parens (Cast <$> valueExpr'
> <*> (keyword_ "as" *> typeName))
> prefixCast = try (TypedLit <$> typeName
> <*> stringLiteral)
the special op keywords
parse an operator which is
operatorname(firstArg keyword0 arg0 keyword1 arg1 etc.)
> data SpecialOpKFirstArg = SOKNone
> | SOKOptional
> | SOKMandatory
> specialOpK :: String -- name of the operator
> -> SpecialOpKFirstArg -- has a first arg without a keyword
> -> [(String,Bool)] -- the other args with their keywords
> -- and whether they are optional
> -> P ValueExpr
> specialOpK opName firstArg kws =
> keyword_ opName >> do
> void $ symbol "("
> let pfa = do
> e <- valueExpr'
> -- check we haven't parsed the first
> -- keyword as an identifier
> guard (case (e,kws) of
> (Iden (Name i), ((k,_):_)) | map toLower i == k -> False
> _ -> True)
> return e
> fa <- case firstArg of
> SOKNone -> return Nothing
> SOKOptional -> optionMaybe (try pfa)
> SOKMandatory -> Just <$> pfa
> as <- mapM parseArg kws
> void $ symbol ")"
> return $ SpecialOpK (Name opName) fa $ catMaybes as
> where
> parseArg (nm,mand) =
> let p = keyword_ nm >> valueExpr'
> in fmap (nm,) <$> if mand
> then Just <$> p
> else optionMaybe (try p)
The actual operators:
EXTRACT( date_part FROM expression )
POSITION( string1 IN string2 )
SUBSTRING(extraction_string FROM starting_position [FOR length]
[COLLATE collation_name])
CONVERT(char_value USING conversion_char_name)
TRANSLATE(char_value USING translation_name)
OVERLAY(string PLACING embedded_string FROM start
[FOR length])
TRIM( [ [{LEADING | TRAILING | BOTH}] [removal_char] FROM ]
target_string
[COLLATE collation_name] )
> specialOpKs :: P ValueExpr
> specialOpKs = choice $ map try
> [extract, position, substring, convert, translate, overlay, trim]
2013-12-14 09:55:44 +01:00
> extract :: P ValueExpr
> extract = specialOpK "extract" SOKMandatory [("from", True)]
2013-12-13 21:38:43 +01:00
> position :: P ValueExpr
> position = specialOpK "position" SOKMandatory [("in", True)]
2013-12-14 09:55:44 +01:00
strictly speaking, the substring must have at least one of from and
for, but the parser doens't enforce this
2013-12-14 09:55:44 +01:00
> substring :: P ValueExpr
> substring = specialOpK "substring" SOKMandatory
> [("from", False),("for", False),("collate", False)]
> convert :: P ValueExpr
> convert = specialOpK "convert" SOKMandatory [("using", True)]
> translate :: P ValueExpr
> translate = specialOpK "translate" SOKMandatory [("using", True)]
> overlay :: P ValueExpr
> overlay = specialOpK "overlay" SOKMandatory
> [("placing", True),("from", True),("for", False)]
trim is too different because of the optional char, so a custom parser
the both ' ' is filled in as the default if either parts are missing
in the source
> trim :: P ValueExpr
> trim =
> keyword "trim" >>
> parens (mkTrim
> <$> option "both" sides
> <*> option " " stringLiteral
> <*> (keyword_ "from" *> valueExpr')
> <*> optionMaybe (keyword_ "collate" *> stringLiteral))
> where
> sides = choice ["leading" <$ keyword_ "leading"
> ,"trailing" <$ keyword_ "trailing"
> ,"both" <$ keyword_ "both"]
> mkTrim fa ch fr cl =
> SpecialOpK (Name "trim") Nothing
> $ catMaybes [Just (fa,StringLit ch)
> ,Just ("from", fr)
> ,fmap (("collate",) . StringLit) cl]
2013-12-14 09:55:44 +01:00
in: two variations:
a in (expr0, expr1, ...)
a in (queryexpr)
> inSuffix :: ValueExpr -> P ValueExpr
> inSuffix e =
2013-12-14 00:14:23 +01:00
> In <$> inty
> <*> return e
> <*> parens (choice
> [InQueryExpr <$> queryExpr
> ,InList <$> commaSep1 valueExpr'])
> where
> inty = try $ choice [True <$ keyword_ "in"
> ,False <$ keyword_ "not" <* keyword_ "in"]
2013-12-14 09:55:44 +01:00
between:
expr between expr and expr
There is a complication when parsing between - when parsing the second
expression it is ambiguous when you hit an 'and' whether it is a
binary operator or part of the between. This code follows what
postgres does, which might be standard across SQL implementations,
which is that you can't have a binary and operator in the middle
expression in a between unless it is wrapped in parens. The 'bExpr
parsing' is used to create alternative value expression parser which
2013-12-14 09:55:44 +01:00
is identical to the normal one expect it doesn't recognise the binary
and operator. This is the call to valueExpr'' True.
2013-12-14 09:55:44 +01:00
> betweenSuffix :: ValueExpr -> P ValueExpr
2013-12-13 20:13:36 +01:00
> betweenSuffix e =
2013-12-17 12:21:36 +01:00
> makeOp <$> (Name <$> opName)
2013-12-14 00:14:23 +01:00
> <*> return e
> <*> valueExpr'' True
> <*> (keyword_ "and" *> valueExpr'' True)
2013-12-13 20:13:36 +01:00
> where
> opName = try $ choice
> ["between" <$ keyword_ "between"
> ,"not between" <$ keyword_ "not" <* keyword_ "between"]
> makeOp n a b c = SpecialOp n [a,b,c]
2013-12-13 20:13:36 +01:00
2013-12-14 09:55:44 +01:00
subquery expression:
[exists|all|any|some] (queryexpr)
> subquery :: P ValueExpr
> subquery =
> choice
> [try $ SubQueryExpr SqSq <$> parens queryExpr
> ,SubQueryExpr <$> try sqkw <*> parens queryExpr]
> where
> sqkw = try $ choice
> [SqExists <$ keyword_ "exists"
> ,SqAll <$ try (keyword_ "all")
> ,SqAny <$ keyword_ "any"
> ,SqSome <$ keyword_ "some"]
2013-12-14 09:55:44 +01:00
typename: used in casts. Special cases for the multi keyword typenames
that SQL supports.
> typeName :: P TypeName
> typeName = choice (multiWordParsers
> ++ [TypeName <$> identifierString])
> >>= optionSuffix precision
> where
> multiWordParsers =
> flip map multiWordTypeNames
> $ \ks -> (TypeName . unwords) <$> try (mapM keyword ks)
> multiWordTypeNames = map words
> ["double precision"
> ,"character varying"
> ,"char varying"
> ,"character large object"
> ,"char large object"
> ,"national character"
> ,"national char"
> ,"national character varying"
> ,"national char varying"
> ,"national character large object"
> ,"nchar large object"
> ,"nchar varying"
> ,"bit varying"
> ]
todo: timestamp types:
| TIME [ <left paren> <time precision> <right paren> ] [ WITH TIME ZONE ]
| TIMESTAMP [ <left paren> <timestamp precision> <right paren> ] [ WITH TIME ZONE ]
> precision t = try (parens (commaSep integerLiteral)) >>= makeWrap t
> makeWrap (TypeName t) [a] = return $ PrecTypeName t a
> makeWrap (TypeName t) [a,b] = return $ PrecScaleTypeName t a b
> makeWrap _ _ = fail "there must be one or two precision components"
== value expression parens and row ctor
2013-12-14 09:55:44 +01:00
> sparens :: P ValueExpr
> sparens =
> ctor <$> parens (commaSep1 valueExpr')
> where
> ctor [a] = Parens a
> ctor as = SpecialOp (Name "rowctor") as
2013-12-14 09:55:44 +01:00
== operator parsing
The 'regular' operators in this parsing and in the abstract syntax are
unary prefix, unary postfix and binary infix operators. The operators
can be symbols (a + b), single keywords (a and b) or multiple keywords
(a is similar to b).
First, the list of the regulars operators split by operator type
(prefix, postfix, binary) and by symbol/single keyword/ multiple
keyword.
> binOpSymbolNames :: [String]
2013-12-14 00:14:23 +01:00
> binOpSymbolNames =
> ["=", "<=", ">=", "!=", "<>", "<", ">"
2013-12-17 15:15:19 +01:00
> ,"*", "/", "+", "-", "%"
> ,"||", "."
> ,"^", "|", "&"
> ]
> binOpKeywordNames :: [String]
2013-12-14 00:14:23 +01:00
> binOpKeywordNames = ["and", "or", "like", "overlaps"]
> binOpMultiKeywordNames :: [[String]]
> binOpMultiKeywordNames = map words
> ["not like"
> ,"is similar to"
> ,"is not similar to"
> ,"is distinct from"
> ,"is not distinct from"]
2013-12-13 20:13:36 +01:00
used for between parsing
> binOpKeywordNamesNoAnd :: [String]
> binOpKeywordNamesNoAnd = filter (/="and") binOpKeywordNames
2013-12-14 09:55:44 +01:00
There aren't any multi keyword prefix operators currently supported.
> prefixUnOpKeywordNames :: [String]
> prefixUnOpKeywordNames = ["not"]
2013-12-13 19:01:57 +01:00
> prefixUnOpSymbolNames :: [String]
2013-12-17 15:15:19 +01:00
> prefixUnOpSymbolNames = ["+", "-", "~"]
2013-12-13 19:01:57 +01:00
2013-12-14 15:58:35 +01:00
There aren't any single keyword postfix operators currently
supported. Maybe all these 'is's can be left factored?
2013-12-14 09:55:44 +01:00
> postfixOpKeywords :: [String]
> postfixOpKeywords = ["is null"
> ,"is not null"
> ,"is true"
> ,"is not true"
> ,"is false"
> ,"is not false"
> ,"is unknown"
> ,"is not unknown"]
The parsers:
2013-12-13 19:01:57 +01:00
> prefixUnaryOp :: P ValueExpr
> prefixUnaryOp =
> PrefixOp <$> (Name <$> opSymbol) <*> valueExpr'
2013-12-13 19:01:57 +01:00
> where
> opSymbol = choice (map (try . symbol) prefixUnOpSymbolNames
2013-12-14 12:05:02 +01:00
> ++ map (try . keyword) prefixUnOpKeywordNames)
2013-12-14 19:20:41 +01:00
TODO: the handling of multikeyword args is different in
postfixopsuffix and binaryoperatorsuffix. It should be the same in
both cases
> postfixOpSuffix :: ValueExpr -> P ValueExpr
2013-12-14 09:55:44 +01:00
> postfixOpSuffix e =
> try $ choice $ map makeOp opPairs
> where
2013-12-14 09:55:44 +01:00
> opPairs = flip map postfixOpKeywords $ \o -> (o, words o)
2013-12-17 12:21:36 +01:00
> makeOp (o,ws) = try $ PostfixOp (Name o) e <$ keywords_ ws
2013-12-13 21:25:22 +01:00
> keywords_ = try . mapM_ keyword_
2013-12-13 11:39:26 +01:00
All the binary operators are parsed as same precedence and left
2013-12-14 16:09:45 +01:00
associativity. This is fixed with a separate pass over the AST.
2013-12-13 20:13:36 +01:00
> binaryOperatorSuffix :: Bool -> ValueExpr -> P ValueExpr
> binaryOperatorSuffix bExpr e0 =
2013-12-17 12:21:36 +01:00
> BinOp e0 <$> (Name <$> opSymbol) <*> factor
2013-12-13 11:39:26 +01:00
> where
> opSymbol = choice
> (map (try . symbol) binOpSymbolNames
> ++ map (try . keywords) binOpMultiKeywordNames
> ++ map (try . keyword)
> (if bExpr
> then binOpKeywordNamesNoAnd
> else binOpKeywordNames))
2013-12-13 21:25:22 +01:00
> keywords ks = unwords <$> mapM keyword ks
2013-12-13 11:39:26 +01:00
> sqlFixities :: [[Fixity]]
> sqlFixities = highPrec ++ defaultPrec ++ lowPrec
> where
> allOps = binOpSymbolNames ++ binOpKeywordNames
2013-12-14 16:09:45 +01:00
> ++ map unwords binOpMultiKeywordNames
> ++ prefixUnOpKeywordNames ++ prefixUnOpSymbolNames
> ++ postfixOpKeywords
> -- these are the ops with the highest precedence in order
2013-12-17 14:21:43 +01:00
> highPrec = [infixl_ ["."]
2013-12-17 15:15:19 +01:00
> ,infixl_ ["*","/", "%"]
> ,infixl_ ["+", "-"]
> ,infixl_ ["<=",">=","!=","<>","||","like"]
> ]
> -- these are the ops with the lowest precedence in order
> lowPrec = [infix_ ["<",">"]
> ,infixr_ ["="]
> ,infixr_ ["not"]
> ,infixl_ ["and"]
> ,infixl_ ["or"]]
> already = concatMap (map fName) highPrec
> ++ concatMap (map fName) lowPrec
> -- all the other ops have equal precedence and go between the
> -- high and low precedence ops
> defaultPrecOps = filter (`notElem` already) allOps
> -- almost correct, have to do some more work to
> -- get the associativity correct for these operators
> defaultPrec = [infixl_ defaultPrecOps]
> fName (Fixity n _) = n
2013-12-13 11:39:26 +01:00
== value expressions
TODO:
left factor stuff which starts with identifier
This parses most of the value exprs. I'm not sure if factor is the
correct terminology here. The order of the parsers and use of try is
carefully done to make everything work. It is a little fragile and
could at least do with some heavy explanation.
> factor :: P ValueExpr
> factor = choice [literal
> ,parameter
> ,scase
> ,cast
> ,try specialOpKs
> ,subquery
> ,prefixUnaryOp
> ,try app
2013-12-17 11:24:37 +01:00
> ,try star
> ,identifier
> ,sparens]
putting the factor together with the extra bits
> valueExpr'' :: Bool -> P ValueExpr
> valueExpr'' bExpr = factor >>= trysuffix
> where
> trysuffix e = try (suffix e) <|> return e
> suffix e0 = choice
> [binaryOperatorSuffix bExpr e0
> ,inSuffix e0
> ,betweenSuffix e0
> ,postfixOpSuffix e0
> ] >>= trysuffix
Wrapper for non 'bExpr' parsing. See the between parser for
explanation.
> valueExpr' :: P ValueExpr
> valueExpr' = valueExpr'' False
The valueExpr wrapper. The idea is that directly nested value
expressions use the valueExpr' parser, then other code uses the
valueExpr parser and then everyone gets the fixity fixes and it's
easy to ensure that this fix is only applied once to each value
2013-12-14 09:55:44 +01:00
expression tree (for efficiency and code clarity).
2013-12-13 11:39:26 +01:00
> valueExpr :: P ValueExpr
> valueExpr = fixFixities sqlFixities <$> valueExpr'
2013-12-13 11:39:26 +01:00
expose the b expression for window frame clause range between
> valueExprB :: P ValueExpr
> valueExprB = fixFixities sqlFixities <$> valueExpr'' True
2013-12-13 11:39:26 +01:00
-------------------------------------------------
= query expressions
2013-12-14 09:55:44 +01:00
== select lists
2013-12-13 16:27:02 +01:00
> selectItem :: P (Maybe Name, ValueExpr)
> selectItem = flip (,) <$> valueExpr <*> optionMaybe (try als)
> where als = optional (try (keyword_ "as")) *> name
2013-12-13 11:39:26 +01:00
> selectList :: P [(Maybe Name,ValueExpr)]
2013-12-13 16:27:02 +01:00
> selectList = commaSep1 selectItem
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
== from
2013-12-14 12:05:02 +01:00
Here is the rough grammar for joins
2013-12-14 09:55:44 +01:00
tref
2013-12-14 12:05:02 +01:00
(cross | [natural] ([inner] | (left | right | full) [outer])) join
tref
2013-12-14 09:55:44 +01:00
[on expr | using (...)]
2013-12-13 11:39:26 +01:00
> from :: P [TableRef]
> from = try (keyword_ "from") *> commaSep1 tref
2013-12-13 11:39:26 +01:00
> where
2013-12-14 12:05:02 +01:00
> tref = nonJoinTref >>= optionSuffix joinTrefSuffix
2013-12-14 13:10:46 +01:00
> nonJoinTref = choice [try (TRQueryExpr <$> parens queryExpr)
> ,TRParens <$> parens tref
> ,TRLateral <$> (try (keyword_ "lateral")
> *> nonJoinTref)
2013-12-17 12:21:36 +01:00
> ,try (TRFunction <$> name
> <*> parens (commaSep valueExpr))
2013-12-17 12:21:36 +01:00
> ,TRSimple <$> name]
2013-12-14 12:05:02 +01:00
> >>= optionSuffix aliasSuffix
> aliasSuffix j = option j (TRAlias j <$> alias)
2013-12-14 12:05:02 +01:00
> joinTrefSuffix t = (do
2013-12-14 16:09:45 +01:00
> nat <- option False $ try (True <$ try (keyword_ "natural"))
2013-12-14 13:10:46 +01:00
> TRJoin t <$> joinType
> <*> nonJoinTref
> <*> optionMaybe (joinCondition nat))
2013-12-14 12:05:02 +01:00
> >>= optionSuffix joinTrefSuffix
2013-12-17 11:27:00 +01:00
> joinType =
> choice [choice
> [JCross <$ try (keyword_ "cross")
> ,JInner <$ try (keyword_ "inner")
> ,choice [JLeft <$ try (keyword_ "left")
> ,JRight <$ try (keyword_ "right")
> ,JFull <$ try (keyword_ "full")]
> <* optional (try $ keyword_ "outer")]
> <* keyword "join"
> ,JInner <$ keyword_ "join"]
2013-12-14 12:05:02 +01:00
> joinCondition nat =
> choice [guard nat >> return JoinNatural
> ,try (keyword_ "on") >>
> JoinOn <$> valueExpr
2013-12-14 12:05:02 +01:00
> ,try (keyword_ "using") >>
2013-12-17 12:21:36 +01:00
> JoinUsing <$> parens (commaSep1 name)
2013-12-13 11:39:26 +01:00
> ]
> alias :: P Alias
> alias = Alias <$> try tableAlias <*> try columnAliases
> where
> tableAlias = optional (try $ keyword_ "as") *> name
> columnAliases = optionMaybe $ try $ parens $ commaSep1 name
2013-12-14 09:55:44 +01:00
== simple other parts
Parsers for where, group by, having, order by and limit, which are
pretty trivial.
Here is a helper for parsing a few parts of the query expr (currently
where, having, limit, offset).
> keywordValueExpr :: String -> P ValueExpr
> keywordValueExpr k = try (keyword_ k) *> valueExpr
2013-12-13 16:27:02 +01:00
> swhere :: P ValueExpr
> swhere = keywordValueExpr "where"
2013-12-13 11:39:26 +01:00
> sgroupBy :: P [GroupingExpr]
> sgroupBy = try (keyword_ "group")
> *> keyword_ "by"
2013-12-17 18:27:09 +01:00
> *> commaSep1 groupingExpression
> where
> groupingExpression =
> choice
> [try (keyword_ "cube") >>
> Cube <$> parens (commaSep groupingExpression)
> ,try (keyword_ "rollup") >>
> Rollup <$> parens (commaSep groupingExpression)
> ,GroupingParens <$> parens (commaSep groupingExpression)
> ,try (keyword_ "grouping") >> keyword_ "sets" >>
> GroupingSets <$> parens (commaSep groupingExpression)
> ,SimpleGroup <$> valueExpr
2013-12-17 18:27:09 +01:00
> ]
2013-12-13 11:39:26 +01:00
> having :: P ValueExpr
> having = keywordValueExpr "having"
2013-12-13 11:39:26 +01:00
> orderBy :: P [SortSpec]
2013-12-14 00:14:23 +01:00
> orderBy = try (keyword_ "order") *> keyword_ "by" *> commaSep1 ob
2013-12-13 16:08:10 +01:00
> where
> ob = SortSpec
> <$> valueExpr
2013-12-17 17:28:31 +01:00
> <*> option Asc (choice [Asc <$ keyword_ "asc"
> ,Desc <$ keyword_ "desc"])
> <*> option NullsOrderDefault
> (try (keyword_ "nulls" >>
> choice [NullsFirst <$ keyword "first"
> ,NullsLast <$ keyword "last"]))
2013-12-13 11:39:26 +01:00
allows offset and fetch in either order
+ postgresql offset without row(s) and limit instead of fetch also
> offsetFetch :: P (Maybe ValueExpr, Maybe ValueExpr)
> offsetFetch = permute ((,) <$?> (Nothing, Just <$> offset)
> <|?> (Nothing, Just <$> fetch))
2013-12-13 16:27:02 +01:00
> offset :: P ValueExpr
> offset = try (keyword_ "offset") *> valueExpr
> <* option () (try $ choice [try (keyword_ "rows"),keyword_ "row"])
> fetch :: P ValueExpr
> fetch = choice [ansiFetch, limit]
> where
> ansiFetch = try (keyword_ "fetch") >>
> choice [keyword_ "first",keyword_ "next"]
> *> valueExpr
> <* choice [keyword_ "rows",keyword_ "row"]
> <* keyword_ "only"
> limit = try (keyword_ "limit") *> valueExpr
2013-12-13 16:27:02 +01:00
2013-12-14 09:55:44 +01:00
== common table expressions
> with :: P QueryExpr
> with = try (keyword_ "with") >>
> With <$> option False (try (True <$ keyword_ "recursive"))
> <*> commaSep1 withQuery <*> queryExpr
> where
2013-12-14 00:14:23 +01:00
> withQuery =
> (,) <$> (alias <* optional (try $ keyword_ "as"))
2013-12-14 00:14:23 +01:00
> <*> parens queryExpr
2013-12-13 16:27:02 +01:00
2013-12-14 09:55:44 +01:00
== query expression
This parser parses any query expression variant: normal select, cte,
and union, etc..
2013-12-13 11:39:26 +01:00
> queryExpr :: P QueryExpr
> queryExpr =
> choice [with
2013-12-17 12:58:44 +01:00
> ,choice [values,table, select]
> >>= optionSuffix queryExprSuffix]
> where
> select = try (keyword_ "select") >>
> mkSelect
> <$> (fromMaybe All <$> duplicates)
> <*> selectList
> <*> option [] from
> <*> optionMaybe swhere
> <*> option [] sgroupBy
> <*> optionMaybe having
> <*> option [] orderBy
> <*> offsetFetch
> mkSelect d sl f w g h od (ofs,fe) =
> Select d sl f w g h od ofs fe
2013-12-17 12:27:16 +01:00
> values = try (keyword_ "values")
> >> Values <$> commaSep (parens (commaSep valueExpr))
2013-12-17 12:58:44 +01:00
> table = try (keyword_ "table") >> Table <$> name
2013-12-13 22:41:12 +01:00
> queryExprSuffix :: QueryExpr -> P QueryExpr
> queryExprSuffix qe =
> (CombineQueryExpr qe
> <$> try (choice
> [Union <$ keyword_ "union"
> ,Intersect <$ keyword_ "intersect"
> ,Except <$ keyword_ "except"])
> <*> (fromMaybe All <$> duplicates)
> <*> option Respectively
> (try (Corresponding <$ keyword_ "corresponding"))
> <*> queryExpr)
> >>= optionSuffix queryExprSuffix
2013-12-13 11:39:26 +01:00
2013-12-14 09:55:44 +01:00
wrapper for query expr which ignores optional trailing semicolon.
> topLevelQueryExpr :: P QueryExpr
> topLevelQueryExpr =
2013-12-14 10:59:29 +01:00
> queryExpr >>= optionSuffix ((symbol ";" *>) . return)
2013-12-14 09:55:44 +01:00
wrapper to parse a series of query exprs from a single source. They
must be separated by semicolon, but for the last expression, the
trailing semicolon is optional.
> queryExprs :: P [QueryExpr]
2013-12-14 16:09:45 +01:00
> queryExprs =
> (:[]) <$> queryExpr
2013-12-14 10:59:29 +01:00
> >>= optionSuffix ((symbol ";" *>) . return)
> >>= optionSuffix (\p -> (p++) <$> queryExprs)
2013-12-13 11:39:26 +01:00
------------------------------------------------
2013-12-14 09:55:44 +01:00
= lexing parsers
The lexing is a bit 'virtual', in the usual parsec style. The
convention in this file is to put all the parsers which access
characters directly or indirectly here (i.e. ones which use char,
string, digit, etc.), except for the parsers which only indirectly
access them via these functions, if you follow?
2013-12-14 09:55:44 +01:00
> symbol :: String -> P String
> symbol s = string s
> -- <* notFollowedBy (oneOf "+-/*<>=!|")
> <* whiteSpace
> symbol_ :: String -> P ()
> symbol_ s = symbol s *> return ()
TODO: now that keyword has try in it, a lot of the trys above can be
removed
2013-12-14 09:55:44 +01:00
> keyword :: String -> P String
> keyword s = try $ do
> i <- identifierRaw
> guard (map toLower i == map toLower s)
> return i
2013-12-14 09:55:44 +01:00
> keyword_ :: String -> P ()
> keyword_ s = keyword s *> return ()
Identifiers are very simple at the moment: start with a letter or
underscore, and continue with letter, underscore or digit. It doesn't
support quoting other other sorts of identifiers yet. There is a
blacklist of keywords which aren't supported as identifiers.
the identifier raw doesn't check the blacklist since it is used by the
keyword parser also
> identifierRaw :: P String
> identifierRaw = (:) <$> letterOrUnderscore
> <*> many letterDigitOrUnderscore <* whiteSpace
2013-12-14 09:55:44 +01:00
> where
> letterOrUnderscore = char '_' <|> letter
> letterDigitOrUnderscore = char '_' <|> alphaNum
> identifierString :: P String
> identifierString = do
> s <- identifierRaw
> guard (map toLower s `notElem` blacklist)
> return s
2013-12-14 09:55:44 +01:00
> blacklist :: [String]
2013-12-14 12:05:02 +01:00
> blacklist =
> ["select", "as", "from", "where", "having", "group", "order"
> ,"limit", "offset", "fetch"
2013-12-14 12:05:02 +01:00
> ,"inner", "left", "right", "full", "natural", "join"
2013-12-17 11:45:32 +01:00
> ,"cross", "on", "using", "lateral"
2013-12-14 12:05:02 +01:00
> ,"when", "then", "case", "end", "in"
> ,"except", "intersect", "union"]
2013-12-14 09:55:44 +01:00
2013-12-14 15:58:35 +01:00
These blacklisted names are mostly needed when we parse something with
an optional alias, e.g. select a a from t. If we write select a from
t, we have to make sure the from isn't parsed as an alias. I'm not
sure what other places strictly need the blacklist, and in theory it
could be tuned differently for each place the identifierString/
identifier parsers are used to only blacklist the bare minimum.
2013-12-14 09:55:44 +01:00
2013-12-17 12:21:36 +01:00
> quotedIdentifier :: P String
> quotedIdentifier = char '"' *> manyTill anyChar (symbol_ "\"")
2013-12-14 09:55:44 +01:00
String literals: limited at the moment, no escaping \' or other
variations.
> stringLiteral :: P String
2013-12-17 14:09:28 +01:00
> stringLiteral = (char '\'' *> manyTill anyChar (char '\'')
> >>= optionSuffix moreString) <* whiteSpace
> where
> moreString s0 = try $ do
> void $ char '\''
> s <- manyTill anyChar (char '\'')
> optionSuffix moreString (s0 ++ "'" ++ s)
2013-12-14 09:55:44 +01:00
number literals
here is the rough grammar target:
digits
digits.[digits][e[+-]digits]
[digits].digits[e[+-]digits]
digitse[+-]digits
2013-12-14 16:09:45 +01:00
numbers are parsed to strings, not to a numeric type. This is to avoid
2013-12-14 09:55:44 +01:00
making a decision on how to represent numbers, the client code can
make this choice.
> numberLiteral :: P String
> numberLiteral =
> choice [int
> >>= optionSuffix dot
> >>= optionSuffix fracts
> >>= optionSuffix expon
> ,fract "" >>= optionSuffix expon]
> <* whiteSpace
> where
> int = many1 digit
> fract p = dot p >>= fracts
2013-12-14 10:47:13 +01:00
> dot p = (p++) <$> string "."
2013-12-14 09:55:44 +01:00
> fracts p = (p++) <$> int
2013-12-14 10:47:13 +01:00
> expon p = concat <$> sequence
> [return p
> ,string "e"
> ,option "" (string "+" <|> string "-")
> ,int]
2013-12-14 09:55:44 +01:00
2013-12-14 16:09:45 +01:00
lexer for integer literals which appear in some places in SQL
> integerLiteral :: P Int
> integerLiteral = read <$> many1 digit <* whiteSpace
2013-12-14 09:55:44 +01:00
whitespace parser which skips comments also
2013-12-13 11:39:26 +01:00
> whiteSpace :: P ()
> whiteSpace =
> choice [simpleWhiteSpace *> whiteSpace
> ,lineComment *> whiteSpace
> ,blockComment *> whiteSpace
> ,return ()]
> where
> lineComment = try (string "--")
> *> manyTill anyChar (void (char '\n') <|> eof)
> blockComment = -- no nesting of block comments in SQL
> try (string "/*")
> -- TODO: why is try used herex
> *> manyTill anyChar (try $ string "*/")
> -- use many1 so we can more easily avoid non terminating loops
> simpleWhiteSpace = void $ many1 (oneOf " \t\n")
2013-12-14 09:55:44 +01:00
= generic parser helpers
2013-12-14 12:05:02 +01:00
a possible issue with the option suffix is that it enforces left
associativity when chaining it recursively. Have to review
all these uses and figure out if any should be right associative
instead, and create an alternative suffix parser
2013-12-13 11:39:26 +01:00
> optionSuffix :: (a -> P a) -> a -> P a
> optionSuffix p a = option a (p a)
> parens :: P a -> P a
> parens = between (symbol_ "(") (symbol_ ")")
2013-12-13 11:39:26 +01:00
> commaSep :: P a -> P [a]
> commaSep = (`sepBy` symbol_ ",")
> commaSep1 :: P a -> P [a]
> commaSep1 = (`sepBy1` symbol_ ",")
2013-12-14 00:14:23 +01:00
--------------------------------------------
= helper functions
> setPos :: Maybe (Int,Int) -> P ()
> setPos Nothing = return ()
> setPos (Just (l,c)) = fmap f getPosition >>= setPosition
> where f = flip setSourceColumn c
> . flip setSourceLine l
> convParseError :: String -> P.ParseError -> ParseError
> convParseError src e =
> ParseError
> {peErrorString = show e
> ,peFilename = sourceName p
> ,pePosition = (sourceLine p, sourceColumn p)
> ,peFormattedError = formatError src e
> }
> where
> p = errorPos e
format the error more nicely: emacs format for positioning, plus
context
> formatError :: String -> P.ParseError -> String
> formatError src e =
> sourceName p ++ ":" ++ show (sourceLine p)
> ++ ":" ++ show (sourceColumn p) ++ ":"
> ++ context
> ++ show e
> where
> context =
> let lns = take 1 $ drop (sourceLine p - 1) $ lines src
> in case lns of
> [x] -> "\n" ++ x ++ "\n"
> ++ replicate (sourceColumn p - 1) ' ' ++ "^\n"
> _ -> ""
> p = errorPos e