> {-# LANGUAGE TupleSections #-} > -- | This is the module with the parser functions. > module Language.SQL.SimpleSQL.Parser > (parseQueryExpr > ,parseValueExpr > ,parseQueryExprs > ,ParseError(..)) where > import Control.Monad.Identity > import Control.Applicative hiding (many, (<|>), optional) > import Data.Maybe > import Data.Char > import Text.Parsec hiding (ParseError) > import qualified Text.Parsec as P > import Text.Parsec.Perm > import Language.SQL.SimpleSQL.Syntax > import Language.SQL.SimpleSQL.Fixity The public API functions. > -- | Parses a query expr, trailing semicolon optional. > parseQueryExpr :: FilePath > -- ^ filename to use in errors > -> Maybe (Int,Int) > -- ^ line number and column number to use in errors > -> String > -- ^ the SQL source to parse > -> Either ParseError QueryExpr > parseQueryExpr = wrapParse topLevelQueryExpr > -- | Parses a list of query exprs, with semi colons between > -- them. The final semicolon is optional. > parseQueryExprs :: FilePath > -- ^ filename to use in errors > -> Maybe (Int,Int) > -- ^ line number and column number to use in errors > -> String > -- ^ the SQL source to parse > -> Either ParseError [QueryExpr] > parseQueryExprs = wrapParse queryExprs > -- | Parses a value expression. > parseValueExpr :: FilePath > -- ^ filename to use in errors > -> Maybe (Int,Int) > -- ^ line number and column number to use in errors > -> String > -- ^ the SQL source to parse > -> Either ParseError ValueExpr > parseValueExpr = wrapParse valueExpr This helper function takes the parser given and: sets the position when parsing automatically skips leading whitespace checks the parser parses all the input using eof converts the error return to the nice wrapper > wrapParse :: P a > -> FilePath > -> Maybe (Int,Int) > -> String > -> Either ParseError a > wrapParse parser f p src = > either (Left . convParseError src) Right > $ parse (setPos p *> whiteSpace *> parser <* eof) f src > -- | Type to represent parse errors. > data ParseError = ParseError > {peErrorString :: String > -- ^ contains the error message > ,peFilename :: FilePath > -- ^ filename location for the error > ,pePosition :: (Int,Int) > -- ^ line number and column number location for the error > ,peFormattedError :: String > -- ^ formatted error with the position, error > -- message and source context > } deriving (Eq,Show) ------------------------------------------------ > type P a = ParsecT String () Identity a = value expressions == literals See the stringLiteral lexer below for notes on string literal syntax. > estring :: P ValueExpr > estring = StringLit <$> stringLiteral > number :: P ValueExpr > number = NumLit <$> numberLiteral parse SQL interval literals, something like interval '5' day (3) or interval '5' month wrap the whole lot in try, in case we get something like this: interval '3 days' which parses as a typed literal > interval :: P ValueExpr > interval = try (keyword_ "interval" >> > IntervalLit > <$> stringLiteral > <*> identifierString > <*> optionMaybe (try $ parens integerLiteral)) > literal :: P ValueExpr > literal = number <|> estring <|> interval == identifiers Uses the identifierString 'lexer'. See this function for notes on identifiers. > name :: P Name > name = choice [QName <$> quotedIdentifier > ,Name <$> identifierString] > identifier :: P ValueExpr > identifier = Iden <$> name == star used in select *, select x.*, and agg(*) variations, and some other places as well. Because it is quite general, the parser doesn't attempt to check that the star is in a valid context, it parses it OK in any value expression context. > star :: P ValueExpr > star = Star <$ symbol "*" == parameter use in e.g. select * from t where a = ? > parameter :: P ValueExpr > parameter = Parameter <$ symbol "?" == function application, aggregates and windows this represents anything which syntactically looks like regular C function application: an identifier, parens with comma sep value expression arguments. The parsing for the aggregate extensions is here as well: aggregate([all|distinct] args [order by orderitems]) > aggOrApp :: P ValueExpr > aggOrApp = > makeApp > <$> name > <*> parens ((,,) <$> try duplicates > <*> choice [commaSep valueExpr'] > <*> try (optionMaybe orderBy)) > where > makeApp i (Nothing,es,Nothing) = App i es > makeApp i (d,es,od) = AggregateApp i d es (fromMaybe [] od) > duplicates :: P (Maybe SetQuantifier) > duplicates = optionMaybe $ try $ > choice [All <$ keyword_ "all" > ,Distinct <$ keyword "distinct"] parse a window call as a suffix of a regular function call this looks like this: functionname(args) over ([partition by ids] [order by orderitems]) No support for explicit frames yet. The convention in this file is that the 'Suffix', erm, suffix on parser names means that they have been left factored. These are almost always used with the optionSuffix combinator. > windowSuffix :: ValueExpr -> P ValueExpr > windowSuffix (App f es) = > try (keyword_ "over") > *> parens (WindowApp f es > <$> option [] partitionBy > <*> option [] orderBy > <*> optionMaybe frameClause) > where > partitionBy = try (keyword_ "partition") >> > keyword_ "by" >> commaSep1 valueExpr' > frameClause = > mkFrame <$> choice [FrameRows <$ keyword_ "rows" > ,FrameRange <$ keyword_ "range"] > <*> frameStartEnd > frameStartEnd = > choice > [try (keyword_ "between") >> > mkFrameBetween <$> frameLimit True > <*> (keyword_ "and" *> frameLimit True) > ,mkFrameFrom <$> frameLimit False] > -- use the bexpression style from the between parsing for frame between > frameLimit useB = > choice > [Current <$ try (keyword_ "current") <* keyword_ "row" > ,try (keyword_ "unbounded") >> > choice [UnboundedPreceding <$ keyword_ "preceding" > ,UnboundedFollowing <$ keyword_ "following"] > ,do > e <- if useB then valueExprB else valueExpr > choice [Preceding e <$ keyword_ "preceding" > ,Following e <$ keyword_ "following"] > ] > mkFrameBetween s e rs = FrameBetween rs s e > mkFrameFrom s rs = FrameFrom rs s > mkFrame rs c = c rs > windowSuffix _ = fail "" > app :: P ValueExpr > app = aggOrApp >>= optionSuffix windowSuffix == case expression > scase :: P ValueExpr > scase = > Case <$> (try (keyword_ "case") *> optionMaybe (try valueExpr')) > <*> many1 swhen > <*> optionMaybe (try (keyword_ "else") *> valueExpr') > <* keyword_ "end" > where > swhen = keyword_ "when" *> > ((,) <$> commaSep1 valueExpr' > <*> (keyword_ "then" *> valueExpr')) == miscellaneous keyword operators These are keyword operators which don't look like normal prefix, postfix or infix binary operators. They mostly look like function application but with keywords in the argument list instead of commas to separate the arguments. cast: cast(expr as type) > cast :: P ValueExpr > cast = parensCast <|> prefixCast > where > parensCast = try (keyword_ "cast") >> > parens (Cast <$> valueExpr' > <*> (keyword_ "as" *> typeName)) > prefixCast = try (TypedLit <$> typeName > <*> stringLiteral) the special op keywords parse an operator which is operatorname(firstArg keyword0 arg0 keyword1 arg1 etc.) > data SpecialOpKFirstArg = SOKNone > | SOKOptional > | SOKMandatory > specialOpK :: String -- name of the operator > -> SpecialOpKFirstArg -- has a first arg without a keyword > -> [(String,Bool)] -- the other args with their keywords > -- and whether they are optional > -> P ValueExpr > specialOpK opName firstArg kws = > keyword_ opName >> do > void $ symbol "(" > let pfa = do > e <- valueExpr' > -- check we haven't parsed the first > -- keyword as an identifier > guard (case (e,kws) of > (Iden (Name i), ((k,_):_)) | map toLower i == k -> False > _ -> True) > return e > fa <- case firstArg of > SOKNone -> return Nothing > SOKOptional -> optionMaybe (try pfa) > SOKMandatory -> Just <$> pfa > as <- mapM parseArg kws > void $ symbol ")" > return $ SpecialOpK (Name opName) fa $ catMaybes as > where > parseArg (nm,mand) = > let p = keyword_ nm >> valueExpr' > in fmap (nm,) <$> if mand > then Just <$> p > else optionMaybe (try p) The actual operators: EXTRACT( date_part FROM expression ) POSITION( string1 IN string2 ) SUBSTRING(extraction_string FROM starting_position [FOR length] [COLLATE collation_name]) CONVERT(char_value USING conversion_char_name) TRANSLATE(char_value USING translation_name) OVERLAY(string PLACING embedded_string FROM start [FOR length]) TRIM( [ [{LEADING | TRAILING | BOTH}] [removal_char] FROM ] target_string [COLLATE collation_name] ) > specialOpKs :: P ValueExpr > specialOpKs = choice $ map try > [extract, position, substring, convert, translate, overlay, trim] > extract :: P ValueExpr > extract = specialOpK "extract" SOKMandatory [("from", True)] > position :: P ValueExpr > position = specialOpK "position" SOKMandatory [("in", True)] strictly speaking, the substring must have at least one of from and for, but the parser doens't enforce this > substring :: P ValueExpr > substring = specialOpK "substring" SOKMandatory > [("from", False),("for", False),("collate", False)] > convert :: P ValueExpr > convert = specialOpK "convert" SOKMandatory [("using", True)] > translate :: P ValueExpr > translate = specialOpK "translate" SOKMandatory [("using", True)] > overlay :: P ValueExpr > overlay = specialOpK "overlay" SOKMandatory > [("placing", True),("from", True),("for", False)] trim is too different because of the optional char, so a custom parser the both ' ' is filled in as the default if either parts are missing in the source > trim :: P ValueExpr > trim = > keyword "trim" >> > parens (mkTrim > <$> option "both" sides > <*> option " " stringLiteral > <*> (keyword_ "from" *> valueExpr') > <*> optionMaybe (keyword_ "collate" *> stringLiteral)) > where > sides = choice ["leading" <$ keyword_ "leading" > ,"trailing" <$ keyword_ "trailing" > ,"both" <$ keyword_ "both"] > mkTrim fa ch fr cl = > SpecialOpK (Name "trim") Nothing > $ catMaybes [Just (fa,StringLit ch) > ,Just ("from", fr) > ,fmap (("collate",) . StringLit) cl] in: two variations: a in (expr0, expr1, ...) a in (queryexpr) > inSuffix :: ValueExpr -> P ValueExpr > inSuffix e = > In <$> inty > <*> return e > <*> parens (choice > [InQueryExpr <$> queryExpr > ,InList <$> commaSep1 valueExpr']) > where > inty = try $ choice [True <$ keyword_ "in" > ,False <$ keyword_ "not" <* keyword_ "in"] between: expr between expr and expr There is a complication when parsing between - when parsing the second expression it is ambiguous when you hit an 'and' whether it is a binary operator or part of the between. This code follows what postgres does, which might be standard across SQL implementations, which is that you can't have a binary and operator in the middle expression in a between unless it is wrapped in parens. The 'bExpr parsing' is used to create alternative value expression parser which is identical to the normal one expect it doesn't recognise the binary and operator. This is the call to valueExpr'' True. > betweenSuffix :: ValueExpr -> P ValueExpr > betweenSuffix e = > makeOp <$> (Name <$> opName) > <*> return e > <*> valueExpr'' True > <*> (keyword_ "and" *> valueExpr'' True) > where > opName = try $ choice > ["between" <$ keyword_ "between" > ,"not between" <$ keyword_ "not" <* keyword_ "between"] > makeOp n a b c = SpecialOp n [a,b,c] subquery expression: [exists|all|any|some] (queryexpr) > subquery :: P ValueExpr > subquery = > choice > [try $ SubQueryExpr SqSq <$> parens queryExpr > ,SubQueryExpr <$> try sqkw <*> parens queryExpr] > where > sqkw = try $ choice > [SqExists <$ keyword_ "exists" > ,SqAll <$ try (keyword_ "all") > ,SqAny <$ keyword_ "any" > ,SqSome <$ keyword_ "some"] typename: used in casts. Special cases for the multi keyword typenames that SQL supports. > typeName :: P TypeName > typeName = choice (multiWordParsers > ++ [TypeName <$> identifierString]) > >>= optionSuffix precision > where > multiWordParsers = > flip map multiWordTypeNames > $ \ks -> (TypeName . unwords) <$> try (mapM keyword ks) > multiWordTypeNames = map words > ["double precision" > ,"character varying" > ,"char varying" > ,"character large object" > ,"char large object" > ,"national character" > ,"national char" > ,"national character varying" > ,"national char varying" > ,"national character large object" > ,"nchar large object" > ,"nchar varying" > ,"bit varying" > ] todo: timestamp types: | TIME [