small fixes
fix positions? small fixes to haddock add notes to top of lexer module simplify line comment lexer remove some trys from lexer fix the block comment parser to return all the comment text when there are embedded / * in the comment refactor the symbol, keyword and identifier blacklist checking into the low level token parsers instead of a separate step using guard
This commit is contained in:
parent
900d19f4c1
commit
bbb793c160
|
@ -1,9 +1,12 @@
|
||||||
|
|
||||||
Lexer TODO:
|
The parser uses a separate lexer for two reasons:
|
||||||
|
|
||||||
left factor to get rid of trys
|
1. sql syntax is very awkward to parse, the separate lexer makes it
|
||||||
|
easier to handle this in most places (in some places it makes it
|
||||||
|
harder or impossible, the fix is to switch to something better than
|
||||||
|
parsec
|
||||||
|
|
||||||
add some notes on why there is a separate lexer.
|
2. using a separate lexer gives a huge speed boost
|
||||||
|
|
||||||
> -- | This is the module contains a Lexer for SQL.
|
> -- | This is the module contains a Lexer for SQL.
|
||||||
> {-# LANGUAGE TupleSections #-}
|
> {-# LANGUAGE TupleSections #-}
|
||||||
|
@ -86,9 +89,8 @@ add some notes on why there is a separate lexer.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
> -- | Accurate pretty printing, if you lex a bunch of tokens,
|
> -- | Pretty printing, if you lex a bunch of tokens, then pretty
|
||||||
> -- then pretty print them, should should get back exactly the
|
> -- print them, should should get back exactly the same string
|
||||||
> -- same string
|
|
||||||
> prettyToken :: Dialect -> Token -> String
|
> prettyToken :: Dialect -> Token -> String
|
||||||
> prettyToken _ (Symbol s) = s
|
> prettyToken _ (Symbol s) = s
|
||||||
> prettyToken _ (Identifier t) = t
|
> prettyToken _ (Identifier t) = t
|
||||||
|
@ -116,6 +118,7 @@ add some notes on why there is a separate lexer.
|
||||||
|
|
||||||
TODO: try to make all parsers applicative only
|
TODO: try to make all parsers applicative only
|
||||||
|
|
||||||
|
> -- | Lex some SQL to a list of tokens.
|
||||||
> lexSQL :: Dialect
|
> lexSQL :: Dialect
|
||||||
> -- ^ dialect of SQL to use
|
> -- ^ dialect of SQL to use
|
||||||
> -> FilePath
|
> -> FilePath
|
||||||
|
@ -127,7 +130,7 @@ TODO: try to make all parsers applicative only
|
||||||
> -- ^ the SQL source to lex
|
> -- ^ the SQL source to lex
|
||||||
> -> Either ParseError [((String,Int,Int),Token)]
|
> -> Either ParseError [((String,Int,Int),Token)]
|
||||||
> lexSQL dialect fn p src =
|
> lexSQL dialect fn p src =
|
||||||
> let (l,c) = fromMaybe (1,0) p
|
> let (l,c) = fromMaybe (1,1) p
|
||||||
> in either (Left . convParseError src) Right
|
> in either (Left . convParseError src) Right
|
||||||
> $ runParser (setPos (l,c) *> many (sqlToken dialect) <* eof) () fn src
|
> $ runParser (setPos (l,c) *> many (sqlToken dialect) <* eof) () fn src
|
||||||
> where
|
> where
|
||||||
|
@ -148,20 +151,6 @@ TODO: try to make all parsers applicative only
|
||||||
> ,symbol d
|
> ,symbol d
|
||||||
> ,sqlWhitespace d]
|
> ,sqlWhitespace d]
|
||||||
|
|
||||||
> takeWhile1 :: (Char -> Bool) -> Parser String
|
|
||||||
> takeWhile1 p = many1 (satisfy p)
|
|
||||||
|
|
||||||
> takeWhile :: (Char -> Bool) -> Parser String
|
|
||||||
> takeWhile p = many (satisfy p)
|
|
||||||
|
|
||||||
> takeTill :: (Char -> Bool) -> Parser String
|
|
||||||
> takeTill p =
|
|
||||||
> try (manyTill anyChar (peekSatisfy p))
|
|
||||||
|
|
||||||
> peekSatisfy :: (Char -> Bool) -> Parser ()
|
|
||||||
> peekSatisfy p = do
|
|
||||||
> void $ try $ lookAhead (satisfy p)
|
|
||||||
|
|
||||||
> identifier :: Dialect -> Parser Token
|
> identifier :: Dialect -> Parser Token
|
||||||
> identifier d =
|
> identifier d =
|
||||||
> choice
|
> choice
|
||||||
|
@ -255,13 +244,8 @@ character symbols in the two lists below.
|
||||||
> lineComment :: Dialect -> Parser Token
|
> lineComment :: Dialect -> Parser Token
|
||||||
> lineComment _ =
|
> lineComment _ =
|
||||||
> (\s -> LineComment $ concat ["--",s]) <$>
|
> (\s -> LineComment $ concat ["--",s]) <$>
|
||||||
> (try (string "--") *> choice
|
> (try (string "--") *>
|
||||||
> [flip snoc '\n' <$> takeTill (=='\n') <* char '\n'
|
> manyTill anyChar (void (char '\n') <|> eof))
|
||||||
> ,takeWhile (/='\n') <* eof
|
|
||||||
> ])
|
|
||||||
> where
|
|
||||||
> snoc :: String -> Char -> String
|
|
||||||
> snoc s a = s ++ [a]
|
|
||||||
|
|
||||||
> blockComment :: Dialect -> Parser Token
|
> blockComment :: Dialect -> Parser Token
|
||||||
> blockComment _ =
|
> blockComment _ =
|
||||||
|
@ -281,7 +265,7 @@ character symbols in the two lists below.
|
||||||
> -- nested comment, recurse
|
> -- nested comment, recurse
|
||||||
> ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
|
> ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
|
||||||
> -- not an end comment or nested comment, continue
|
> -- not an end comment or nested comment, continue
|
||||||
> ,(:) <$> anyChar <*> commentSuffix n]
|
> ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
|
||||||
|
|
||||||
|
|
||||||
> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
|
> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
|
||||||
|
@ -289,3 +273,17 @@ character symbols in the two lists below.
|
||||||
> c <- satisfy p
|
> c <- satisfy p
|
||||||
> choice [(:) c <$> (takeWhile1 ps)
|
> choice [(:) c <$> (takeWhile1 ps)
|
||||||
> ,return [c]]
|
> ,return [c]]
|
||||||
|
|
||||||
|
> takeWhile1 :: (Char -> Bool) -> Parser String
|
||||||
|
> takeWhile1 p = many1 (satisfy p)
|
||||||
|
|
||||||
|
> takeWhile :: (Char -> Bool) -> Parser String
|
||||||
|
> takeWhile p = many (satisfy p)
|
||||||
|
|
||||||
|
> takeTill :: (Char -> Bool) -> Parser String
|
||||||
|
> takeTill p =
|
||||||
|
> manyTill anyChar (peekSatisfy p)
|
||||||
|
|
||||||
|
> peekSatisfy :: (Char -> Bool) -> Parser ()
|
||||||
|
> peekSatisfy p = do
|
||||||
|
> void $ lookAhead (satisfy p)
|
||||||
|
|
|
@ -183,7 +183,7 @@ fixing them in the syntax but leaving them till the semantic checking
|
||||||
> ,ParseError(..)) where
|
> ,ParseError(..)) where
|
||||||
|
|
||||||
> import Control.Monad.Identity (Identity)
|
> import Control.Monad.Identity (Identity)
|
||||||
> import Control.Monad (guard, void, when)
|
> import Control.Monad (guard, void)
|
||||||
> import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
|
> import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
|
||||||
> import Data.Char (toLower, isDigit)
|
> import Data.Char (toLower, isDigit)
|
||||||
> import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
|
> import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
|
||||||
|
@ -261,7 +261,7 @@ converts the error return to the nice wrapper
|
||||||
> -> String
|
> -> String
|
||||||
> -> Either ParseError a
|
> -> Either ParseError a
|
||||||
> wrapParse parser d f p src = do
|
> wrapParse parser d f p src = do
|
||||||
> let (l,c) = fromMaybe (1,0) p
|
> let (l,c) = fromMaybe (1,1) p
|
||||||
> lx <- L.lexSQL d f (Just (l,c)) src
|
> lx <- L.lexSQL d f (Just (l,c)) src
|
||||||
> either (Left . convParseError src) Right
|
> either (Left . convParseError src) Right
|
||||||
> $ runParser (setPos p *> parser <* eof)
|
> $ runParser (setPos p *> parser <* eof)
|
||||||
|
@ -316,7 +316,7 @@ u&"example quoted"
|
||||||
> d <- getState
|
> d <- getState
|
||||||
> choice [QName <$> qidentifierTok
|
> choice [QName <$> qidentifierTok
|
||||||
> ,UQName <$> uqidentifierTok
|
> ,UQName <$> uqidentifierTok
|
||||||
> ,Name <$> identifierBlacklist (blacklist d)
|
> ,Name <$> identifierTok (blacklist d) Nothing
|
||||||
> ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok
|
> ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok
|
||||||
> ]
|
> ]
|
||||||
|
|
||||||
|
@ -545,7 +545,7 @@ See the stringToken lexer below for notes on string literal syntax.
|
||||||
> stringLit = StringLit <$> stringTokExtend
|
> stringLit = StringLit <$> stringTokExtend
|
||||||
|
|
||||||
> numberLit :: Parser ValueExpr
|
> numberLit :: Parser ValueExpr
|
||||||
> numberLit = NumLit <$> sqlNumberTok
|
> numberLit = NumLit <$> sqlNumberTok False
|
||||||
|
|
||||||
> characterSetLit :: Parser ValueExpr
|
> characterSetLit :: Parser ValueExpr
|
||||||
> characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok
|
> characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok
|
||||||
|
@ -999,7 +999,7 @@ for the escape now there is a separate lexer ...
|
||||||
> pure $ \v -> ctor v c
|
> pure $ \v -> ctor v c
|
||||||
> where
|
> where
|
||||||
> escapeChar :: Parser Char
|
> escapeChar :: Parser Char
|
||||||
> escapeChar = (identifierTok <|> symbolTok) >>= oneOnly
|
> escapeChar = (identifierTok [] Nothing <|> symbolTok Nothing) >>= oneOnly
|
||||||
> oneOnly :: String -> Parser Char
|
> oneOnly :: String -> Parser Char
|
||||||
> oneOnly c = case c of
|
> oneOnly c = case c of
|
||||||
> [c'] -> return c'
|
> [c'] -> return c'
|
||||||
|
@ -1501,23 +1501,25 @@ and it will parse as a single string
|
||||||
> L.HostParam p -> Just p
|
> L.HostParam p -> Just p
|
||||||
> _ -> Nothing)
|
> _ -> Nothing)
|
||||||
|
|
||||||
> sqlNumberTok :: Parser String
|
> sqlNumberTok :: Bool -> Parser String
|
||||||
> sqlNumberTok = mytoken (\tok ->
|
> sqlNumberTok intOnly = mytoken (\tok ->
|
||||||
> case tok of
|
> case tok of
|
||||||
> L.SqlNumber p -> Just p
|
> L.SqlNumber p | not intOnly || all isDigit p -> Just p
|
||||||
> _ -> Nothing)
|
> _ -> Nothing)
|
||||||
|
|
||||||
|
|
||||||
> symbolTok :: Parser String
|
> symbolTok :: Maybe String -> Parser String
|
||||||
> symbolTok = mytoken (\tok ->
|
> symbolTok sym = mytoken (\tok ->
|
||||||
> case tok of
|
> case (sym,tok) of
|
||||||
> L.Symbol p -> Just p
|
> (Nothing, L.Symbol p) -> Just p
|
||||||
|
> (Just s, L.Symbol p) | s == p -> Just p
|
||||||
> _ -> Nothing)
|
> _ -> Nothing)
|
||||||
|
|
||||||
> identifierTok :: Parser String
|
> identifierTok :: [String] -> Maybe String -> Parser String
|
||||||
> identifierTok = mytoken (\tok ->
|
> identifierTok blackList kw = mytoken (\tok ->
|
||||||
> case tok of
|
> case (kw,tok) of
|
||||||
> L.Identifier p -> Just p
|
> (Nothing, L.Identifier p) | map toLower p `notElem` blackList -> Just p
|
||||||
|
> (Just k, L.Identifier p) | k == map toLower p -> Just p
|
||||||
> _ -> Nothing)
|
> _ -> Nothing)
|
||||||
|
|
||||||
> qidentifierTok :: Parser String
|
> qidentifierTok :: Parser String
|
||||||
|
@ -1547,19 +1549,12 @@ and it will parse as a single string
|
||||||
> testToken (_,tok) = test tok
|
> testToken (_,tok) = test tok
|
||||||
|
|
||||||
> unsignedInteger :: Parser Integer
|
> unsignedInteger :: Parser Integer
|
||||||
> unsignedInteger = try (do
|
> unsignedInteger = read <$> sqlNumberTok True <?> "natural number"
|
||||||
> x <- sqlNumberTok
|
|
||||||
> guard (all isDigit x)
|
|
||||||
> return $ read x
|
|
||||||
> ) <?> "integer"
|
|
||||||
|
|
||||||
todo: work out the symbol parsing better
|
todo: work out the symbol parsing better
|
||||||
|
|
||||||
> symbol :: String -> Parser String
|
> symbol :: String -> Parser String
|
||||||
> symbol s = try (do
|
> symbol s = symbolTok (Just s) <?> s
|
||||||
> u <- symbolTok
|
|
||||||
> guard (s == u)
|
|
||||||
> pure s) <?> s
|
|
||||||
|
|
||||||
> singleCharSymbol :: Char -> Parser Char
|
> singleCharSymbol :: Char -> Parser Char
|
||||||
> singleCharSymbol c = c <$ symbol [c]
|
> singleCharSymbol c = c <$ symbol [c]
|
||||||
|
@ -1589,10 +1584,7 @@ todo: work out the symbol parsing better
|
||||||
= helper functions
|
= helper functions
|
||||||
|
|
||||||
> keyword :: String -> Parser String
|
> keyword :: String -> Parser String
|
||||||
> keyword k = try (do
|
> keyword k = identifierTok [] (Just k) <?> k
|
||||||
> i <- identifierTok
|
|
||||||
> guard (map toLower i == k)
|
|
||||||
> pure k) <?> k
|
|
||||||
|
|
||||||
helper function to improve error messages
|
helper function to improve error messages
|
||||||
|
|
||||||
|
@ -1618,14 +1610,6 @@ helper function to improve error messages
|
||||||
> commaSep1 :: Parser a -> Parser [a]
|
> commaSep1 :: Parser a -> Parser [a]
|
||||||
> commaSep1 = (`sepBy1` comma)
|
> commaSep1 = (`sepBy1` comma)
|
||||||
|
|
||||||
> identifierBlacklist :: [String] -> Parser String
|
|
||||||
> identifierBlacklist bl = try (do
|
|
||||||
> i <- identifierTok
|
|
||||||
> when (map toLower i `elem` bl) $
|
|
||||||
> fail $ "keyword not allowed here: " ++ i
|
|
||||||
> pure i)
|
|
||||||
> <?> "identifier"
|
|
||||||
|
|
||||||
> blacklist :: Dialect -> [String]
|
> blacklist :: Dialect -> [String]
|
||||||
> blacklist = reservedWord
|
> blacklist = reservedWord
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,7 @@ Test for the lexer
|
||||||
> -- block comment
|
> -- block comment
|
||||||
> ++ map (\c -> (c, [BlockComment c]))
|
> ++ map (\c -> (c, [BlockComment c]))
|
||||||
> ["/**/", "/* */","/* this is a comment */"
|
> ["/**/", "/* */","/* this is a comment */"
|
||||||
> -- todo: bug in the lexer, fix this
|
> ,"/* this *is/ a comment */"
|
||||||
> --,"/* this *is/ a comment */"
|
|
||||||
> ]
|
> ]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue