diff --git a/Language/SQL/SimpleSQL/Lexer.lhs b/Language/SQL/SimpleSQL/Lexer.lhs index 1409e1c..35a94ac 100644 --- a/Language/SQL/SimpleSQL/Lexer.lhs +++ b/Language/SQL/SimpleSQL/Lexer.lhs @@ -1,9 +1,12 @@ -Lexer TODO: +The parser uses a separate lexer for two reasons: -left factor to get rid of trys +1. sql syntax is very awkward to parse, the separate lexer makes it +easier to handle this in most places (in some places it makes it +harder or impossible, the fix is to switch to something better than +parsec -add some notes on why there is a separate lexer. +2. using a separate lexer gives a huge speed boost > -- | This is the module contains a Lexer for SQL. > {-# LANGUAGE TupleSections #-} @@ -86,9 +89,8 @@ add some notes on why there is a separate lexer. -> -- | Accurate pretty printing, if you lex a bunch of tokens, -> -- then pretty print them, should should get back exactly the -> -- same string +> -- | Pretty printing, if you lex a bunch of tokens, then pretty +> -- print them, should should get back exactly the same string > prettyToken :: Dialect -> Token -> String > prettyToken _ (Symbol s) = s > prettyToken _ (Identifier t) = t @@ -116,6 +118,7 @@ add some notes on why there is a separate lexer. TODO: try to make all parsers applicative only +> -- | Lex some SQL to a list of tokens. > lexSQL :: Dialect > -- ^ dialect of SQL to use > -> FilePath @@ -127,7 +130,7 @@ TODO: try to make all parsers applicative only > -- ^ the SQL source to lex > -> Either ParseError [((String,Int,Int),Token)] > lexSQL dialect fn p src = -> let (l,c) = fromMaybe (1,0) p +> let (l,c) = fromMaybe (1,1) p > in either (Left . convParseError src) Right > $ runParser (setPos (l,c) *> many (sqlToken dialect) <* eof) () fn src > where @@ -148,20 +151,6 @@ TODO: try to make all parsers applicative only > ,symbol d > ,sqlWhitespace d] -> takeWhile1 :: (Char -> Bool) -> Parser String -> takeWhile1 p = many1 (satisfy p) - -> takeWhile :: (Char -> Bool) -> Parser String -> takeWhile p = many (satisfy p) - -> takeTill :: (Char -> Bool) -> Parser String -> takeTill p = -> try (manyTill anyChar (peekSatisfy p)) - -> peekSatisfy :: (Char -> Bool) -> Parser () -> peekSatisfy p = do -> void $ try $ lookAhead (satisfy p) - > identifier :: Dialect -> Parser Token > identifier d = > choice @@ -255,13 +244,8 @@ character symbols in the two lists below. > lineComment :: Dialect -> Parser Token > lineComment _ = > (\s -> LineComment $ concat ["--",s]) <$> -> (try (string "--") *> choice -> [flip snoc '\n' <$> takeTill (=='\n') <* char '\n' -> ,takeWhile (/='\n') <* eof -> ]) -> where -> snoc :: String -> Char -> String -> snoc s a = s ++ [a] +> (try (string "--") *> +> manyTill anyChar (void (char '\n') <|> eof)) > blockComment :: Dialect -> Parser Token > blockComment _ = @@ -281,7 +265,7 @@ character symbols in the two lists below. > -- nested comment, recurse > ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1)) > -- not an end comment or nested comment, continue -> ,(:) <$> anyChar <*> commentSuffix n] +> ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n] > startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String @@ -289,3 +273,17 @@ character symbols in the two lists below. > c <- satisfy p > choice [(:) c <$> (takeWhile1 ps) > ,return [c]] + +> takeWhile1 :: (Char -> Bool) -> Parser String +> takeWhile1 p = many1 (satisfy p) + +> takeWhile :: (Char -> Bool) -> Parser String +> takeWhile p = many (satisfy p) + +> takeTill :: (Char -> Bool) -> Parser String +> takeTill p = +> manyTill anyChar (peekSatisfy p) + +> peekSatisfy :: (Char -> Bool) -> Parser () +> peekSatisfy p = do +> void $ lookAhead (satisfy p) diff --git a/Language/SQL/SimpleSQL/Parser.lhs b/Language/SQL/SimpleSQL/Parser.lhs index 5ad8fc2..a766f8e 100644 --- a/Language/SQL/SimpleSQL/Parser.lhs +++ b/Language/SQL/SimpleSQL/Parser.lhs @@ -183,7 +183,7 @@ fixing them in the syntax but leaving them till the semantic checking > ,ParseError(..)) where > import Control.Monad.Identity (Identity) -> import Control.Monad (guard, void, when) +> import Control.Monad (guard, void) > import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure) > import Data.Char (toLower, isDigit) > import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition @@ -261,7 +261,7 @@ converts the error return to the nice wrapper > -> String > -> Either ParseError a > wrapParse parser d f p src = do -> let (l,c) = fromMaybe (1,0) p +> let (l,c) = fromMaybe (1,1) p > lx <- L.lexSQL d f (Just (l,c)) src > either (Left . convParseError src) Right > $ runParser (setPos p *> parser <* eof) @@ -316,7 +316,7 @@ u&"example quoted" > d <- getState > choice [QName <$> qidentifierTok > ,UQName <$> uqidentifierTok -> ,Name <$> identifierBlacklist (blacklist d) +> ,Name <$> identifierTok (blacklist d) Nothing > ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok > ] @@ -545,7 +545,7 @@ See the stringToken lexer below for notes on string literal syntax. > stringLit = StringLit <$> stringTokExtend > numberLit :: Parser ValueExpr -> numberLit = NumLit <$> sqlNumberTok +> numberLit = NumLit <$> sqlNumberTok False > characterSetLit :: Parser ValueExpr > characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok @@ -999,7 +999,7 @@ for the escape now there is a separate lexer ... > pure $ \v -> ctor v c > where > escapeChar :: Parser Char -> escapeChar = (identifierTok <|> symbolTok) >>= oneOnly +> escapeChar = (identifierTok [] Nothing <|> symbolTok Nothing) >>= oneOnly > oneOnly :: String -> Parser Char > oneOnly c = case c of > [c'] -> return c' @@ -1501,23 +1501,25 @@ and it will parse as a single string > L.HostParam p -> Just p > _ -> Nothing) -> sqlNumberTok :: Parser String -> sqlNumberTok = mytoken (\tok -> +> sqlNumberTok :: Bool -> Parser String +> sqlNumberTok intOnly = mytoken (\tok -> > case tok of -> L.SqlNumber p -> Just p +> L.SqlNumber p | not intOnly || all isDigit p -> Just p > _ -> Nothing) -> symbolTok :: Parser String -> symbolTok = mytoken (\tok -> -> case tok of -> L.Symbol p -> Just p +> symbolTok :: Maybe String -> Parser String +> symbolTok sym = mytoken (\tok -> +> case (sym,tok) of +> (Nothing, L.Symbol p) -> Just p +> (Just s, L.Symbol p) | s == p -> Just p > _ -> Nothing) -> identifierTok :: Parser String -> identifierTok = mytoken (\tok -> -> case tok of -> L.Identifier p -> Just p +> identifierTok :: [String] -> Maybe String -> Parser String +> identifierTok blackList kw = mytoken (\tok -> +> case (kw,tok) of +> (Nothing, L.Identifier p) | map toLower p `notElem` blackList -> Just p +> (Just k, L.Identifier p) | k == map toLower p -> Just p > _ -> Nothing) > qidentifierTok :: Parser String @@ -1547,19 +1549,12 @@ and it will parse as a single string > testToken (_,tok) = test tok > unsignedInteger :: Parser Integer -> unsignedInteger = try (do -> x <- sqlNumberTok -> guard (all isDigit x) -> return $ read x -> ) "integer" +> unsignedInteger = read <$> sqlNumberTok True "natural number" todo: work out the symbol parsing better > symbol :: String -> Parser String -> symbol s = try (do -> u <- symbolTok -> guard (s == u) -> pure s) s +> symbol s = symbolTok (Just s) s > singleCharSymbol :: Char -> Parser Char > singleCharSymbol c = c <$ symbol [c] @@ -1589,10 +1584,7 @@ todo: work out the symbol parsing better = helper functions > keyword :: String -> Parser String -> keyword k = try (do -> i <- identifierTok -> guard (map toLower i == k) -> pure k) k +> keyword k = identifierTok [] (Just k) k helper function to improve error messages @@ -1618,14 +1610,6 @@ helper function to improve error messages > commaSep1 :: Parser a -> Parser [a] > commaSep1 = (`sepBy1` comma) -> identifierBlacklist :: [String] -> Parser String -> identifierBlacklist bl = try (do -> i <- identifierTok -> when (map toLower i `elem` bl) $ -> fail $ "keyword not allowed here: " ++ i -> pure i) -> "identifier" - > blacklist :: Dialect -> [String] > blacklist = reservedWord diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs index d328dde..4e33cc1 100644 --- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs +++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs @@ -51,8 +51,7 @@ Test for the lexer > -- block comment > ++ map (\c -> (c, [BlockComment c])) > ["/**/", "/* */","/* this is a comment */" -> -- todo: bug in the lexer, fix this -> --,"/* this *is/ a comment */" +> ,"/* this *is/ a comment */" > ]