add separate lexer
This commit is contained in:
parent
1364c58534
commit
2df76e3095
7 changed files with 639 additions and 182 deletions
Language/SQL/SimpleSQL
|
@ -17,9 +17,8 @@
|
|||
|
||||
> import Control.Applicative ((<$>), (<*>), (<**>), pure, Applicative)
|
||||
> import Text.Parsec (option,many)
|
||||
> import Text.Parsec.Prim (Parsec)
|
||||
> import Text.Parsec.String (GenParser)
|
||||
|
||||
> type Parser s = Parsec String s
|
||||
|
||||
a possible issue with the option suffix is that it enforces left
|
||||
associativity when chaining it recursively. Have to review
|
||||
|
@ -29,7 +28,7 @@ instead, and create an alternative suffix parser
|
|||
This function style is not good, and should be replaced with chain and
|
||||
<??> which has a different type
|
||||
|
||||
> optionSuffix :: (a -> Parser s a) -> a -> Parser s a
|
||||
> optionSuffix :: (a -> GenParser t s a) -> a -> GenParser t s a
|
||||
> optionSuffix p a = option a (p a)
|
||||
|
||||
|
||||
|
@ -39,7 +38,7 @@ hand result, taken from uu-parsinglib
|
|||
TODO: make sure the precedence higher than <|> and lower than the
|
||||
other operators so it can be used nicely
|
||||
|
||||
> (<??>) :: Parser s a -> Parser s (a -> a) -> Parser s a
|
||||
> (<??>) :: GenParser t s a -> GenParser t s (a -> a) -> GenParser t s a
|
||||
> p <??> q = p <**> option id q
|
||||
|
||||
|
||||
|
@ -80,7 +79,7 @@ composing suffix parsers, not sure about the name. This is used to add
|
|||
a second or more suffix parser contingent on the first suffix parser
|
||||
succeeding.
|
||||
|
||||
> (<??.>) :: Parser s (a -> a) -> Parser s (a -> a) -> Parser s (a -> a)
|
||||
> (<??.>) :: GenParser t s (a -> a) -> GenParser t s (a -> a) -> GenParser t s (a -> a)
|
||||
> (<??.>) pa pb = (.) `c` pa <*> option id pb
|
||||
> -- todo: fix this mess
|
||||
> where c = (<$>) . flip
|
||||
|
@ -88,7 +87,7 @@ succeeding.
|
|||
|
||||
0 to many repeated applications of suffix parser
|
||||
|
||||
> (<??*>) :: Parser s a -> Parser s (a -> a) -> Parser s a
|
||||
> (<??*>) :: GenParser t s a -> GenParser t s (a -> a) -> GenParser t s a
|
||||
> p <??*> q = foldr ($) <$> p <*> (reverse <$> many q)
|
||||
|
||||
|
||||
|
|
296
Language/SQL/SimpleSQL/Lexer.lhs
Normal file
296
Language/SQL/SimpleSQL/Lexer.lhs
Normal file
|
@ -0,0 +1,296 @@
|
|||
|
||||
Lexer TODO:
|
||||
|
||||
left factor to get rid of trys
|
||||
|
||||
> -- | This is the module contains a Lexer for SQL.
|
||||
> {-# LANGUAGE TupleSections #-}
|
||||
> module Language.SQL.SimpleSQL.Lexer
|
||||
> (lexSQL
|
||||
> ,Token(..)
|
||||
> ,prettyToken
|
||||
> ,prettyTokens
|
||||
> ,Position
|
||||
> ,ParseError(..)
|
||||
> ,Dialect(..)) where
|
||||
|
||||
> import Language.SQL.SimpleSQL.Syntax (Dialect(..))
|
||||
|
||||
> import Text.Parsec (option,string,manyTill,anyChar
|
||||
> ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
|
||||
> ,many,runParser,lookAhead,satisfy)
|
||||
> import Language.SQL.SimpleSQL.Combinators
|
||||
> import Language.SQL.SimpleSQL.Errors
|
||||
> import Control.Applicative hiding ((<|>), many)
|
||||
> import Data.Char
|
||||
> import Control.Monad
|
||||
> import Prelude hiding (takeWhile)
|
||||
> import Text.Parsec.String (Parser)
|
||||
|
||||
|
||||
> -- | Represents a lexed token
|
||||
> data Token
|
||||
> -- | a symbol is one of the following
|
||||
> -- * multi char symbols <> <= >= != ||
|
||||
> -- * single char symbols: * + - < > ^ / % ~ & | ? ( ) [ ] , ; ( )
|
||||
> --
|
||||
> = Symbol String
|
||||
>
|
||||
> -- | This is an identifier or keyword.
|
||||
> --
|
||||
> | Identifier String
|
||||
>
|
||||
> -- | This is an identifier quoted with "
|
||||
> | QIdentifier String
|
||||
> -- | This is an identifier quoted with u&"
|
||||
> | UQIdentifier String
|
||||
|
||||
> -- | This is a dialect specific quoted identifier with the quote
|
||||
> -- characters explicit. The first and second fields are the
|
||||
> -- starting and ending quote characters.n
|
||||
> | DQIdentifier String String String
|
||||
>
|
||||
> -- | This is a host param symbol, e.g. :param
|
||||
> | HostParam String
|
||||
>
|
||||
> -- | This is a string literal.
|
||||
> | SqlString String
|
||||
>
|
||||
> -- | This is a character set string literal. The first field is
|
||||
> -- the charatecter set (one of nNbBxX).
|
||||
> | CSSqlString String String
|
||||
>
|
||||
> -- | a number literal (integral or otherwise), stored in original format
|
||||
> -- unchanged
|
||||
> | SqlNumber String
|
||||
>
|
||||
> -- | non-significant whitespace (space, tab, newline) (strictly speaking,
|
||||
> -- it is up to the client to decide whether the whitespace is significant
|
||||
> -- or not)
|
||||
> | Whitespace String
|
||||
>
|
||||
> -- | a commented line using --, contains every character starting with the
|
||||
> -- \'--\' and including the terminating newline character if there is one
|
||||
> -- - this will be missing if the last line in the source is a line comment
|
||||
> -- with no trailing newline
|
||||
> | LineComment String
|
||||
>
|
||||
> -- | a block comment, \/* stuff *\/, includes the comment delimiters
|
||||
> | BlockComment String
|
||||
>
|
||||
> deriving (Eq,Show)
|
||||
|
||||
|
||||
|
||||
> -- | Accurate pretty printing, if you lex a bunch of tokens,
|
||||
> -- then pretty print them, should should get back exactly the
|
||||
> -- same string
|
||||
> prettyToken :: Dialect -> Token -> String
|
||||
> prettyToken _ (Symbol s) = s
|
||||
> prettyToken _ (Identifier t) = t
|
||||
> prettyToken _ (QIdentifier t) =
|
||||
> "\"" ++ doubleChars '"' t ++ "\""
|
||||
> prettyToken _ (UQIdentifier t) =
|
||||
> "u&\"" ++ doubleChars '"' t ++ "\""
|
||||
> prettyToken _ (DQIdentifier s e t) =
|
||||
> s ++ t ++ e
|
||||
> prettyToken _ (HostParam p) = ':':p
|
||||
> prettyToken _ (SqlString t) = "'" ++ doubleChars '\'' t ++ "'"
|
||||
> prettyToken _ (CSSqlString cs t) = cs ++ "'" ++ t ++ "'"
|
||||
> prettyToken _ (SqlNumber r) = r
|
||||
> prettyToken _ (Whitespace t) = t
|
||||
> prettyToken _ (LineComment l) = l
|
||||
> prettyToken _ (BlockComment c) = c
|
||||
|
||||
> prettyTokens :: Dialect -> [Token] -> String
|
||||
> prettyTokens d ts = concat $ map (prettyToken d) ts
|
||||
|
||||
> doubleChars :: Char -> String -> String
|
||||
> doubleChars _ [] = []
|
||||
> doubleChars c (d:ds) | c == d = c:d:doubleChars c ds
|
||||
> | otherwise = d:doubleChars c ds
|
||||
|
||||
TODO: try to make all parsers applicative only
|
||||
|
||||
> type Position = (String,Int,Int)
|
||||
|
||||
> addPosition :: Position -> String -> Position
|
||||
> addPosition = addPosition'
|
||||
|
||||
> addPosition' :: Position -> String -> Position
|
||||
> addPosition' (f,l,c) [] = (f,l,c)
|
||||
> addPosition' (f,l,_) ('\n':xs) = addPosition' (f,l+1,0) xs
|
||||
> addPosition' (f,l,c) (_:xs) = addPosition' (f,l,c+1) xs
|
||||
|
||||
|
||||
|
||||
> lexSQL :: Dialect -> Position -> String -> Either ParseError [(Position,Token)]
|
||||
> lexSQL dialect pos@(fn,_,_) txt =
|
||||
> either (Left . convParseError fn) Right
|
||||
> $ runParser (many_p pos <* eof) () "" txt
|
||||
> where
|
||||
|
||||
> many_p pos' = some_p pos' `mplus` return []
|
||||
> some_p pos' = do
|
||||
> tok <- sqlToken dialect pos'
|
||||
> let pos'' = advancePos dialect pos' (snd tok)
|
||||
> (tok:) <$> many_p pos''
|
||||
|
||||
> advancePos :: Dialect -> Position -> Token -> Position
|
||||
> advancePos dialect pos tok =
|
||||
> let pt = prettyToken dialect tok
|
||||
> in addPosition pos pt
|
||||
|
||||
> -- | parser for a sql token
|
||||
> sqlToken :: Dialect -> Position -> Parser (Position,Token)
|
||||
> sqlToken d p =
|
||||
> (p,) <$> choice [sqlString d
|
||||
> ,identifier d
|
||||
> ,hostParam d
|
||||
> ,lineComment d
|
||||
> ,blockComment d
|
||||
> ,sqlNumber d
|
||||
> ,symbol d
|
||||
> ,sqlWhitespace d]
|
||||
|
||||
> takeWhile1 :: (Char -> Bool) -> Parser String
|
||||
> takeWhile1 p = many1 (satisfy p)
|
||||
|
||||
> takeWhile :: (Char -> Bool) -> Parser String
|
||||
> takeWhile p = many (satisfy p)
|
||||
|
||||
> takeTill :: (Char -> Bool) -> Parser String
|
||||
> takeTill p =
|
||||
> try (manyTill anyChar (peekSatisfy p))
|
||||
|
||||
> peekSatisfy :: (Char -> Bool) -> Parser ()
|
||||
> peekSatisfy p = do
|
||||
> void $ try $ lookAhead (satisfy p)
|
||||
|
||||
> identifier :: Dialect -> Parser Token
|
||||
> identifier d =
|
||||
> choice
|
||||
> [QIdentifier <$> qiden
|
||||
> ,UQIdentifier <$> ((try (string "u&" <|> string "U&")) *> qiden)
|
||||
> ,Identifier <$> identifierString
|
||||
> ,DQIdentifier "`" "`" <$> mySqlQIden
|
||||
> ]
|
||||
> where
|
||||
> qiden = char '"' *> qidenSuffix ""
|
||||
> qidenSuffix t = do
|
||||
> s <- takeTill (=='"')
|
||||
> void $ char '"'
|
||||
> -- deal with "" as literal double quote character
|
||||
> choice [do
|
||||
> void $ char '"'
|
||||
> qidenSuffix $ concat [t,s,"\""]
|
||||
> ,return $ concat [t,s]]
|
||||
> -- mysql can quote identifiers with `
|
||||
> mySqlQIden = do
|
||||
> guard (d == MySQL)
|
||||
> char '`' *> takeWhile1 (/='`') <* char '`'
|
||||
|
||||
> identifierString :: Parser String
|
||||
> identifierString =
|
||||
> startsWith (\c -> c == '_' || isAlpha c)
|
||||
> (\c -> c == '_' || isAlphaNum c)
|
||||
|
||||
|
||||
> sqlString :: Dialect -> Parser Token
|
||||
> sqlString _ =
|
||||
> choice [csString
|
||||
> ,normalString
|
||||
> ]
|
||||
> where
|
||||
> normalString = SqlString {-"'"-} <$> (char '\'' *> normalStringSuffix "")
|
||||
> normalStringSuffix t = do
|
||||
> s <- takeTill (=='\'')
|
||||
> void $ char '\''
|
||||
> -- deal with '' as literal quote character
|
||||
> choice [do
|
||||
> void $ char '\''
|
||||
> normalStringSuffix $ concat [t,s,"'"]
|
||||
> ,return $ concat [t,s]]
|
||||
> csString = CSSqlString <$> try (cs <* char '\'') <*> normalStringSuffix ""
|
||||
> cs = choice [(:[]) <$> oneOf "nNbBxX"
|
||||
> ,string "u&"
|
||||
> ,string "U&"]
|
||||
|
||||
> hostParam :: Dialect -> Parser Token
|
||||
> hostParam _ = HostParam <$> (char ':' *> identifierString)
|
||||
|
||||
|
||||
> sqlNumber :: Dialect -> Parser Token
|
||||
> sqlNumber _ = SqlNumber <$>
|
||||
> (int <??> (pp dot <??.> pp int)
|
||||
> <|> try ((++) <$> dot <*> int))
|
||||
> <??> pp expon
|
||||
> where
|
||||
> int = many1 digit
|
||||
> dot = string "."
|
||||
> expon = (:) <$> oneOf "eE" <*> sInt
|
||||
> sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
|
||||
> pp = (<$$> (++))
|
||||
|
||||
digits
|
||||
digits.[digits][e[+-]digits]
|
||||
[digits].digits[e[+-]digits]
|
||||
digitse[+-]digits
|
||||
|
||||
where digits is one or more decimal digits (0 through 9). At least one
|
||||
digit must be before or after the decimal point, if one is used. At
|
||||
least one digit must follow the exponent marker (e), if one is
|
||||
present. There cannot be any spaces or other characters embedded in
|
||||
the constant. Note that any leading plus or minus sign is not actually
|
||||
considered part of the constant; it is an operator applied to the
|
||||
constant.
|
||||
|
||||
|
||||
A symbol is one of the two character symbols, or one of the single
|
||||
character symbols in the two lists below.
|
||||
|
||||
> symbol :: Dialect -> Parser Token
|
||||
> symbol _ = Symbol <$> choice (many1 (char '.') :
|
||||
> map (try . string) [">=","<=","!=","<>","||"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;()")
|
||||
|
||||
> sqlWhitespace :: Dialect -> Parser Token
|
||||
> sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
|
||||
|
||||
> lineComment :: Dialect -> Parser Token
|
||||
> lineComment _ =
|
||||
> (\s -> LineComment $ concat ["--",s]) <$>
|
||||
> (try (string "--") *> choice
|
||||
> [flip snoc '\n' <$> takeTill (=='\n') <* char '\n'
|
||||
> ,takeWhile (/='\n') <* eof
|
||||
> ])
|
||||
> where
|
||||
> snoc :: String -> Char -> String
|
||||
> snoc s a = s ++ [a]
|
||||
|
||||
> blockComment :: Dialect -> Parser Token
|
||||
> blockComment _ =
|
||||
> (\s -> BlockComment $ concat ["/*",s]) <$>
|
||||
> (try (string "/*") *> commentSuffix 0)
|
||||
> where
|
||||
> commentSuffix :: Int -> Parser String
|
||||
> commentSuffix n = do
|
||||
> -- read until a possible end comment or nested comment
|
||||
> x <- takeWhile (\e -> e /= '/' && e /= '*')
|
||||
> choice [-- close comment: if the nesting is 0, done
|
||||
> -- otherwise recurse on commentSuffix
|
||||
> try (string "*/") *> let t = concat [x,"*/"]
|
||||
> in if n == 0
|
||||
> then return t
|
||||
> else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
|
||||
> -- nested comment, recurse
|
||||
> ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
|
||||
> -- not an end comment or nested comment, continue
|
||||
> ,(:) <$> anyChar <*> commentSuffix n]
|
||||
|
||||
|
||||
> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
|
||||
> startsWith p ps = do
|
||||
> c <- satisfy p
|
||||
> choice [(:) c <$> (takeWhile1 ps)
|
||||
> ,return [c]]
|
|
@ -47,7 +47,7 @@ isn't done because of the impact on the parser error
|
|||
messages. Apparently it can also help the speed but this hasn't been
|
||||
looked into.
|
||||
|
||||
== Parser rrror messages
|
||||
== Parser error messages
|
||||
|
||||
A lot of care has been given to generating good parser error messages
|
||||
for invalid syntax. There are a few utils below which partially help
|
||||
|
@ -185,22 +185,25 @@ fixing them in the syntax but leaving them till the semantic checking
|
|||
> import Control.Monad.Identity (Identity)
|
||||
> import Control.Monad (guard, void, when)
|
||||
> import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
|
||||
> import Data.Maybe (catMaybes)
|
||||
> import Data.Char (toLower)
|
||||
> import Data.Char (toLower, isDigit)
|
||||
> import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
|
||||
> ,option,between,sepBy,sepBy1,string,manyTill,anyChar
|
||||
> ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
|
||||
> ,optionMaybe,optional,many,letter,runParser
|
||||
> ,chainl1, chainr1,(<?>) {-,notFollowedBy,alphaNum-}, lookAhead)
|
||||
> ,option,between,sepBy,sepBy1
|
||||
> ,try,many1,(<|>),choice,eof
|
||||
> ,optionMaybe,optional,runParser
|
||||
> ,chainl1, chainr1,(<?>))
|
||||
> -- import Text.Parsec.String (Parser)
|
||||
> import Text.Parsec.Perm (permute,(<$?>), (<|?>))
|
||||
> import Text.Parsec.Prim (Parsec, getState)
|
||||
> import Text.Parsec.Prim (getState, token)
|
||||
> import Text.Parsec.Pos (newPos)
|
||||
> import qualified Text.Parsec.Expr as E
|
||||
> import Data.List (intercalate,sort,groupBy)
|
||||
> import Data.Function (on)
|
||||
> import Language.SQL.SimpleSQL.Syntax
|
||||
> import Language.SQL.SimpleSQL.Combinators
|
||||
> import Language.SQL.SimpleSQL.Errors
|
||||
> import qualified Language.SQL.SimpleSQL.Lexer as L
|
||||
> import Data.Maybe
|
||||
> import Text.Parsec.String (GenParser)
|
||||
|
||||
= Public API
|
||||
|
||||
|
@ -257,14 +260,21 @@ converts the error return to the nice wrapper
|
|||
> -> Maybe (Int,Int)
|
||||
> -> String
|
||||
> -> Either ParseError a
|
||||
> wrapParse parser d f p src =
|
||||
> wrapParse parser d f p src = do
|
||||
> let (l,c) = fromMaybe (1,0) p
|
||||
> lx <- L.lexSQL d (f,l,c) src
|
||||
> either (Left . convParseError src) Right
|
||||
> $ runParser (setPos p *> whitespace *> parser <* eof)
|
||||
> d f src
|
||||
> $ runParser (setPos p *> parser <* eof)
|
||||
> d f $ filter keep lx
|
||||
> where
|
||||
> setPos Nothing = pure ()
|
||||
> setPos (Just (l,c)) = fmap up getPosition >>= setPosition
|
||||
> where up = flip setSourceColumn c . flip setSourceLine l
|
||||
> keep (_,L.Whitespace {}) = False
|
||||
> keep (_,L.LineComment {}) = False
|
||||
> keep (_,L.BlockComment {}) = False
|
||||
> keep _ = True
|
||||
|
||||
|
||||
------------------------------------------------
|
||||
|
||||
|
@ -304,15 +314,11 @@ u&"example quoted"
|
|||
> name :: Parser Name
|
||||
> name = do
|
||||
> d <- getState
|
||||
> choice [QName <$> quotedIdentifier
|
||||
> ,UQName <$> uquotedIdentifier
|
||||
> choice [QName <$> qidentifierTok
|
||||
> ,UQName <$> uqidentifierTok
|
||||
> ,Name <$> identifierBlacklist (blacklist d)
|
||||
> ,dqName]
|
||||
> where
|
||||
> dqName = guardDialect [MySQL] *>
|
||||
> lexeme (DQName "`" "`"
|
||||
> <$> (char '`'
|
||||
> *> manyTill anyChar (char '`')))
|
||||
> ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok
|
||||
> ]
|
||||
|
||||
todo: replace (:[]) with a named function all over
|
||||
|
||||
|
@ -433,7 +439,7 @@ Unfortunately, to improve the error messages, there is a lot of (left)
|
|||
factoring in this function, and it is a little dense.
|
||||
|
||||
> typeName :: Parser TypeName
|
||||
> typeName = lexeme $
|
||||
> typeName =
|
||||
> (rowTypeName <|> intervalTypeName <|> otherTypeName)
|
||||
> <??*> tnSuffix
|
||||
> where
|
||||
|
@ -536,20 +542,13 @@ factoring in this function, and it is a little dense.
|
|||
See the stringToken lexer below for notes on string literal syntax.
|
||||
|
||||
> stringLit :: Parser ValueExpr
|
||||
> stringLit = StringLit <$> stringToken
|
||||
> stringLit = StringLit <$> stringTokExtend
|
||||
|
||||
> numberLit :: Parser ValueExpr
|
||||
> numberLit = NumLit <$> numberLiteral
|
||||
> numberLit = NumLit <$> sqlNumberTok
|
||||
|
||||
> characterSetLit :: Parser ValueExpr
|
||||
> characterSetLit =
|
||||
> CSStringLit <$> shortCSPrefix <*> stringToken
|
||||
> where
|
||||
> shortCSPrefix = try $ choice
|
||||
> [(:[]) <$> oneOf "nNbBxX"
|
||||
> ,string "u&"
|
||||
> ,string "U&"
|
||||
> ] <* lookAhead quote
|
||||
> characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok
|
||||
|
||||
> simpleLiteral :: Parser ValueExpr
|
||||
> simpleLiteral = numberLit <|> stringLit <|> characterSetLit
|
||||
|
@ -575,8 +574,8 @@ select x from t where x > :param
|
|||
> parameter = choice
|
||||
> [Parameter <$ questionMark
|
||||
> ,HostParameter
|
||||
> <$> hostParameterToken
|
||||
> <*> optionMaybe (keyword "indicator" *> hostParameterToken)]
|
||||
> <$> hostParamTok
|
||||
> <*> optionMaybe (keyword "indicator" *> hostParamTok)]
|
||||
|
||||
== parens
|
||||
|
||||
|
@ -675,7 +674,7 @@ this. also fix the monad -> applicative
|
|||
> intervalLit = try (keyword_ "interval" >> do
|
||||
> s <- optionMaybe $ choice [True <$ symbol_ "+"
|
||||
> ,False <$ symbol_ "-"]
|
||||
> lit <- stringToken
|
||||
> lit <- stringTok
|
||||
> q <- optionMaybe intervalQualifier
|
||||
> mkIt s lit q)
|
||||
> where
|
||||
|
@ -701,7 +700,7 @@ all the value expressions which start with an identifier
|
|||
> idenExpr :: Parser ValueExpr
|
||||
> idenExpr =
|
||||
> -- todo: work out how to left factor this
|
||||
> try (TypedLit <$> typeName <*> stringToken)
|
||||
> try (TypedLit <$> typeName <*> stringTokExtend)
|
||||
> <|> (names <**> option Iden app)
|
||||
|
||||
=== special
|
||||
|
@ -731,9 +730,11 @@ operatorname(firstArg keyword0 arg0 keyword1 arg1 etc.)
|
|||
> e <- valueExpr
|
||||
> -- check we haven't parsed the first
|
||||
> -- keyword as an identifier
|
||||
> guard (case (e,kws) of
|
||||
> (Iden [Name i], (k,_):_) | map toLower i == k -> False
|
||||
> _ -> True)
|
||||
> case (e,kws) of
|
||||
> (Iden [Name i], (k,_):_)
|
||||
> | map toLower i == k ->
|
||||
> fail $ "cannot use keyword here: " ++ i
|
||||
> _ -> return ()
|
||||
> pure e
|
||||
> fa <- case firstArg of
|
||||
> SOKNone -> pure Nothing
|
||||
|
@ -806,7 +807,7 @@ in the source
|
|||
> keyword "trim" >>
|
||||
> parens (mkTrim
|
||||
> <$> option "both" sides
|
||||
> <*> option " " stringToken
|
||||
> <*> option " " stringTok
|
||||
> <*> (keyword_ "from" *> valueExpr))
|
||||
> where
|
||||
> sides = choice ["leading" <$ keyword_ "leading"
|
||||
|
@ -986,13 +987,28 @@ a match (select a from t)
|
|||
|
||||
=== escape
|
||||
|
||||
It is going to be really difficult to support an arbitrary character
|
||||
for the escape now there is a separate lexer ...
|
||||
|
||||
> escapeSuffix :: Parser (ValueExpr -> ValueExpr)
|
||||
> escapeSuffix = do
|
||||
> ctor <- choice
|
||||
> [Escape <$ keyword_ "escape"
|
||||
> ,UEscape <$ keyword_ "uescape"]
|
||||
> c <- anyChar
|
||||
> c <- escapeChar
|
||||
> pure $ \v -> ctor v c
|
||||
> where
|
||||
> escapeChar = escapeIden <|> escapeSym
|
||||
> escapeIden = do
|
||||
> c <- identifierTok
|
||||
> case c of
|
||||
> [c'] -> return c'
|
||||
> _ -> fail "escape char must be single char"
|
||||
> escapeSym = do
|
||||
> c <- symbolTok
|
||||
> case c of
|
||||
> [c'] -> return c'
|
||||
> _ -> fail "escape char must be single char"
|
||||
|
||||
=== collate
|
||||
|
||||
|
@ -1018,7 +1034,7 @@ syntax is way too messy. It might be possible to avoid this if we
|
|||
wanted to avoid extensibility and to not be concerned with parse error
|
||||
messages, but both of these are too important.
|
||||
|
||||
> opTable :: Bool -> [[E.Operator String ParseState Identity ValueExpr]]
|
||||
> opTable :: Bool -> [[E.Operator [Token] ParseState Identity ValueExpr]]
|
||||
> opTable bExpr =
|
||||
> [-- parse match and quantified comparisons as postfix ops
|
||||
> -- todo: left factor the quantified comparison with regular
|
||||
|
@ -1452,163 +1468,134 @@ thick.
|
|||
|
||||
------------------------------------------------
|
||||
|
||||
= lexing parsers
|
||||
= lexing
|
||||
|
||||
whitespace parser which skips comments also
|
||||
TODO: push checks into here:
|
||||
keyword blacklists
|
||||
unsigned integer match
|
||||
symbol matching
|
||||
keyword matching
|
||||
|
||||
> whitespace :: Parser ()
|
||||
> whitespace =
|
||||
> choice [simpleWhitespace *> whitespace
|
||||
> ,lineComment *> whitespace
|
||||
> ,blockComment *> whitespace
|
||||
> ,pure ()] <?> "whitespace"
|
||||
> csSqlStringLitTok :: Parser (String,String)
|
||||
> csSqlStringLitTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.CSSqlString p s -> Just (p,s)
|
||||
> _ -> Nothing)
|
||||
|
||||
> stringTok :: Parser String
|
||||
> stringTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.SqlString s -> Just s
|
||||
> _ -> Nothing)
|
||||
|
||||
This is to support SQL strings where you can write
|
||||
'part of a string' ' another part'
|
||||
and it will parse as a single string
|
||||
|
||||
> stringTokExtend :: Parser String
|
||||
> stringTokExtend = do
|
||||
> x <- stringTok
|
||||
> choice [
|
||||
> ((x++) <$> stringTokExtend)
|
||||
> ,return x
|
||||
> ]
|
||||
|
||||
> hostParamTok :: Parser String
|
||||
> hostParamTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.HostParam p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
> sqlNumberTok :: Parser String
|
||||
> sqlNumberTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.SqlNumber p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
|
||||
> symbolTok :: Parser String
|
||||
> symbolTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.Symbol p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
> identifierTok :: Parser String
|
||||
> identifierTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.Identifier p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
> qidentifierTok :: Parser String
|
||||
> qidentifierTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.QIdentifier p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
> dqidentifierTok :: Parser (String,String,String)
|
||||
> dqidentifierTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.DQIdentifier s e t -> Just (s,e,t)
|
||||
> _ -> Nothing)
|
||||
|
||||
> uqidentifierTok :: Parser String
|
||||
> uqidentifierTok = mytoken (\tok ->
|
||||
> case tok of
|
||||
> L.UQIdentifier p -> Just p
|
||||
> _ -> Nothing)
|
||||
|
||||
|
||||
> mytoken :: (L.Token -> Maybe a) -> Parser a
|
||||
> mytoken test = token showToken posToken testToken
|
||||
> where
|
||||
> lineComment = try (string "--")
|
||||
> *> manyTill anyChar (void (char '\n') <|> eof)
|
||||
> blockComment = -- no nesting of block comments in SQL
|
||||
> try (string "/*")
|
||||
> -- try used here so it doesn't fail when we see a
|
||||
> -- '*' which isn't followed by a '/'
|
||||
> *> manyTill anyChar (try $ string "*/")
|
||||
> -- use many1 so we can more easily avoid non terminating loops
|
||||
> simpleWhitespace = void $ many1 (oneOf " \t\n")
|
||||
|
||||
> lexeme :: Parser a -> Parser a
|
||||
> lexeme p = p <* whitespace
|
||||
> showToken (_,tok) = show tok
|
||||
> posToken ((a,b,c),_) = newPos a b c
|
||||
> testToken (_,tok) = test tok
|
||||
|
||||
> unsignedInteger :: Parser Integer
|
||||
> unsignedInteger = read <$> lexeme (many1 digit) <?> "integer"
|
||||
|
||||
|
||||
number literals
|
||||
|
||||
here is the rough grammar target:
|
||||
|
||||
digits
|
||||
digits.[digits][e[+-]digits]
|
||||
[digits].digits[e[+-]digits]
|
||||
digitse[+-]digits
|
||||
|
||||
numbers are parsed to strings, not to a numeric type. This is to avoid
|
||||
making a decision on how to represent numbers, the client code can
|
||||
make this choice.
|
||||
|
||||
> numberLiteral :: Parser String
|
||||
> numberLiteral = lexeme (
|
||||
> (int <??> (pp dot <??.> pp int)
|
||||
> <|> (++) <$> dot <*> int)
|
||||
> <??> pp expon)
|
||||
> where
|
||||
> int = many1 digit
|
||||
> dot = string "."
|
||||
> expon = (:) <$> oneOf "eE" <*> sInt
|
||||
> sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
|
||||
> pp = (<$$> (++))
|
||||
|
||||
|
||||
> identifier :: Parser String
|
||||
> identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
|
||||
> <?> "identifier"
|
||||
> where
|
||||
> firstChar = letter <|> char '_' <?> "identifier"
|
||||
> nonFirstChar = digit <|> firstChar <?> ""
|
||||
|
||||
> quotedIdentifier :: Parser String
|
||||
> quotedIdentifier = quotedIdenHelper
|
||||
|
||||
> quotedIdenHelper :: Parser String
|
||||
> quotedIdenHelper =
|
||||
> lexeme (dq *> manyTill anyChar dq >>= optionSuffix moreIden)
|
||||
> <?> "identifier"
|
||||
> where
|
||||
> moreIden s0 = do
|
||||
> void dq
|
||||
> s <- manyTill anyChar dq
|
||||
> optionSuffix moreIden (s0 ++ "\"" ++ s)
|
||||
> dq = char '"' <?> "double quote"
|
||||
|
||||
> uquotedIdentifier :: Parser String
|
||||
> uquotedIdentifier =
|
||||
> try (string "u&" <|> string "U&") *> quotedIdenHelper
|
||||
> <?> "identifier"
|
||||
|
||||
parses an identifier with a : prefix. The : isn't included in the
|
||||
return value
|
||||
|
||||
> hostParameterToken :: Parser String
|
||||
> hostParameterToken = lexeme $ char ':' *> identifier
|
||||
> unsignedInteger = try (do
|
||||
> x <- sqlNumberTok
|
||||
> guard (all isDigit x)
|
||||
> return $ read x
|
||||
> ) <?> "integer"
|
||||
|
||||
todo: work out the symbol parsing better
|
||||
|
||||
> symbol :: String -> Parser String
|
||||
> symbol s = try (lexeme $ do
|
||||
> u <- choice (many1 (char '.') :
|
||||
> map (try . string) [">=","<=","!=","<>","||"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&|<>=")
|
||||
> guard (s == u)
|
||||
> pure s)
|
||||
> <?> s
|
||||
> symbol s = try (do
|
||||
> u <- symbolTok
|
||||
> guard (s == u)
|
||||
> pure s) <?> s
|
||||
|
||||
> singleCharSymbol :: Char -> Parser Char
|
||||
> singleCharSymbol c = c <$ symbol [c]
|
||||
|
||||
> questionMark :: Parser Char
|
||||
> questionMark = lexeme (char '?') <?> "question mark"
|
||||
> questionMark = singleCharSymbol '?' <?> "question mark"
|
||||
|
||||
> openParen :: Parser Char
|
||||
> openParen = lexeme $ char '('
|
||||
> openParen = singleCharSymbol '('
|
||||
|
||||
> closeParen :: Parser Char
|
||||
> closeParen = lexeme $ char ')'
|
||||
> closeParen = singleCharSymbol ')'
|
||||
|
||||
> openBracket :: Parser Char
|
||||
> openBracket = lexeme $ char '['
|
||||
> openBracket = singleCharSymbol '['
|
||||
|
||||
> closeBracket :: Parser Char
|
||||
> closeBracket = lexeme $ char ']'
|
||||
> closeBracket = singleCharSymbol ']'
|
||||
|
||||
|
||||
> comma :: Parser Char
|
||||
> comma = lexeme (char ',') <?> "comma"
|
||||
> comma = singleCharSymbol ','
|
||||
|
||||
> semi :: Parser Char
|
||||
> semi = lexeme (char ';') <?> "semicolon"
|
||||
|
||||
> quote :: Parser Char
|
||||
> quote = lexeme (char '\'') <?> "single quote"
|
||||
|
||||
> --stringToken :: Parser String
|
||||
> --stringToken = lexeme (char '\'' *> manyTill anyChar (char '\''))
|
||||
> -- todo: tidy this up, add the prefixes stuff, and add the multiple
|
||||
> -- string stuff
|
||||
> stringToken :: Parser String
|
||||
> stringToken =
|
||||
> lexeme (nlquote *> manyTill anyChar nlquote
|
||||
> >>= optionSuffix moreString)
|
||||
> <?> "string"
|
||||
> where
|
||||
> moreString s0 = choice
|
||||
> [-- handle two adjacent quotes
|
||||
> do
|
||||
> void nlquote
|
||||
> s <- manyTill anyChar nlquote
|
||||
> optionSuffix moreString (s0 ++ "'" ++ s)
|
||||
> ,-- handle string in separate parts
|
||||
> -- e.g. 'part 1' 'part 2'
|
||||
> do --can this whitespace be factored out?
|
||||
> -- since it will be parsed twice when there is no more literal
|
||||
> -- yes: split the adjacent quote and multiline literal
|
||||
> -- into two different suffixes
|
||||
> -- won't need to call lexeme at the top level anymore after this
|
||||
> try (whitespace <* nlquote)
|
||||
> s <- manyTill anyChar nlquote
|
||||
> optionSuffix moreString (s0 ++ s)
|
||||
> ]
|
||||
> -- non lexeme quote
|
||||
> nlquote = char '\'' <?> "single quote"
|
||||
> semi = singleCharSymbol ';'
|
||||
|
||||
= helper functions
|
||||
|
||||
> keyword :: String -> Parser String
|
||||
> keyword k = try (do
|
||||
> i <- identifier
|
||||
> i <- identifierTok
|
||||
> guard (map toLower i == k)
|
||||
> pure k) <?> k
|
||||
|
||||
|
@ -1638,7 +1625,7 @@ helper function to improve error messages
|
|||
|
||||
> identifierBlacklist :: [String] -> Parser String
|
||||
> identifierBlacklist bl = try (do
|
||||
> i <- identifier
|
||||
> i <- identifierTok
|
||||
> when (map toLower i `elem` bl) $
|
||||
> fail $ "keyword not allowed here: " ++ i
|
||||
> pure i)
|
||||
|
@ -2001,7 +1988,10 @@ different parsers can be used for different dialects
|
|||
|
||||
> type ParseState = Dialect
|
||||
|
||||
> type Parser = Parsec String ParseState
|
||||
> type Token = (L.Position,L.Token)
|
||||
|
||||
> --type Parser = Parsec String ParseState
|
||||
> type Parser = GenParser Token ParseState
|
||||
|
||||
> guardDialect :: [Dialect] -> Parser ()
|
||||
> guardDialect ds = do
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue