From bbb793c1607d02f17821ea9af753c06d670f244c Mon Sep 17 00:00:00 2001
From: Jake Wheat <jakewheatmail@gmail.com>
Date: Sat, 1 Aug 2015 13:22:07 +0300
Subject: [PATCH] small fixes

fix positions?
small fixes to haddock
add notes to top of lexer module
simplify line comment lexer
remove some trys from lexer
fix the block comment parser to return all the comment text when there
  are embedded / * in the comment
refactor the symbol, keyword and identifier blacklist checking into
  the low level token parsers instead of a separate step using guard
---
 Language/SQL/SimpleSQL/Lexer.lhs            | 56 ++++++++++----------
 Language/SQL/SimpleSQL/Parser.lhs           | 58 ++++++++-------------
 tools/Language/SQL/SimpleSQL/LexerTests.lhs |  3 +-
 3 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/Language/SQL/SimpleSQL/Lexer.lhs b/Language/SQL/SimpleSQL/Lexer.lhs
index 1409e1c..35a94ac 100644
--- a/Language/SQL/SimpleSQL/Lexer.lhs
+++ b/Language/SQL/SimpleSQL/Lexer.lhs
@@ -1,9 +1,12 @@
 
-Lexer TODO:
+The parser uses a separate lexer for two reasons:
 
-left factor to get rid of trys
+1. sql syntax is very awkward to parse, the separate lexer makes it
+easier to handle this in most places (in some places it makes it
+harder or impossible, the fix is to switch to something better than
+parsec
 
-add some notes on why there is a separate lexer.
+2. using a separate lexer gives a huge speed boost
 
 > -- | This is the module contains a Lexer for SQL.
 > {-# LANGUAGE TupleSections #-}
@@ -86,9 +89,8 @@ add some notes on why there is a separate lexer.
 
 
 
-> -- | Accurate pretty printing, if you lex a bunch of tokens,
-> -- then pretty print them, should should get back exactly the
-> -- same string
+> -- | Pretty printing, if you lex a bunch of tokens, then pretty
+> -- print them, should should get back exactly the same string
 > prettyToken :: Dialect -> Token -> String
 > prettyToken _ (Symbol s) = s
 > prettyToken _ (Identifier t) = t
@@ -116,6 +118,7 @@ add some notes on why there is a separate lexer.
 
 TODO: try to make all parsers applicative only
 
+> -- | Lex some SQL to a list of tokens.
 > lexSQL :: Dialect
 >                   -- ^ dialect of SQL to use
 >                -> FilePath
@@ -127,7 +130,7 @@ TODO: try to make all parsers applicative only
 >                   -- ^ the SQL source to lex
 >                -> Either ParseError [((String,Int,Int),Token)]
 > lexSQL dialect fn p src =
->     let (l,c) = fromMaybe (1,0) p
+>     let (l,c) = fromMaybe (1,1) p
 >     in either (Left . convParseError src) Right
 >        $ runParser (setPos (l,c) *> many (sqlToken dialect) <* eof) () fn src
 >   where
@@ -148,20 +151,6 @@ TODO: try to make all parsers applicative only
 >                     ,symbol d
 >                     ,sqlWhitespace d]
 
-> takeWhile1 :: (Char -> Bool) -> Parser String
-> takeWhile1 p = many1 (satisfy p)
-
-> takeWhile :: (Char -> Bool) -> Parser String
-> takeWhile p = many (satisfy p)
-
-> takeTill :: (Char -> Bool) -> Parser String
-> takeTill p =
->     try (manyTill anyChar (peekSatisfy p))
-
-> peekSatisfy :: (Char -> Bool) -> Parser ()
-> peekSatisfy p = do
->     void $ try $ lookAhead (satisfy p)
-
 > identifier :: Dialect -> Parser Token
 > identifier d =
 >     choice
@@ -255,13 +244,8 @@ character symbols in the two lists below.
 > lineComment :: Dialect -> Parser Token
 > lineComment _ =
 >     (\s -> LineComment $ concat ["--",s]) <$>
->     (try (string "--") *> choice
->                     [flip snoc '\n' <$> takeTill (=='\n') <* char '\n'
->                     ,takeWhile (/='\n') <* eof
->                     ])
->   where
->     snoc :: String -> Char -> String
->     snoc s a = s ++ [a]
+>     (try (string "--") *>
+>      manyTill anyChar (void (char '\n') <|> eof))
 
 > blockComment :: Dialect -> Parser Token
 > blockComment _ =
@@ -281,7 +265,7 @@ character symbols in the two lists below.
 >               -- nested comment, recurse
 >              ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
 >               -- not an end comment or nested comment, continue
->              ,(:) <$> anyChar <*> commentSuffix n]
+>              ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
 
 
 > startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
@@ -289,3 +273,17 @@ character symbols in the two lists below.
 >   c <- satisfy p
 >   choice [(:) c <$> (takeWhile1 ps)
 >          ,return [c]]
+
+> takeWhile1 :: (Char -> Bool) -> Parser String
+> takeWhile1 p = many1 (satisfy p)
+
+> takeWhile :: (Char -> Bool) -> Parser String
+> takeWhile p = many (satisfy p)
+
+> takeTill :: (Char -> Bool) -> Parser String
+> takeTill p =
+>     manyTill anyChar (peekSatisfy p)
+
+> peekSatisfy :: (Char -> Bool) -> Parser ()
+> peekSatisfy p = do
+>     void $ lookAhead (satisfy p)
diff --git a/Language/SQL/SimpleSQL/Parser.lhs b/Language/SQL/SimpleSQL/Parser.lhs
index 5ad8fc2..a766f8e 100644
--- a/Language/SQL/SimpleSQL/Parser.lhs
+++ b/Language/SQL/SimpleSQL/Parser.lhs
@@ -183,7 +183,7 @@ fixing them in the syntax but leaving them till the semantic checking
 >     ,ParseError(..)) where
 
 > import Control.Monad.Identity (Identity)
-> import Control.Monad (guard, void, when)
+> import Control.Monad (guard, void)
 > import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
 > import Data.Char (toLower, isDigit)
 > import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
@@ -261,7 +261,7 @@ converts the error return to the nice wrapper
 >           -> String
 >           -> Either ParseError a
 > wrapParse parser d f p src = do
->     let (l,c) = fromMaybe (1,0) p
+>     let (l,c) = fromMaybe (1,1) p
 >     lx <- L.lexSQL d f (Just (l,c)) src
 >     either (Left . convParseError src) Right
 >       $ runParser (setPos p *> parser <* eof)
@@ -316,7 +316,7 @@ u&"example quoted"
 >     d <- getState
 >     choice [QName <$> qidentifierTok
 >            ,UQName <$> uqidentifierTok
->            ,Name <$> identifierBlacklist (blacklist d)
+>            ,Name <$> identifierTok (blacklist d) Nothing
 >            ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok
 >            ]
 
@@ -545,7 +545,7 @@ See the stringToken lexer below for notes on string literal syntax.
 > stringLit = StringLit <$> stringTokExtend
 
 > numberLit :: Parser ValueExpr
-> numberLit = NumLit <$> sqlNumberTok
+> numberLit = NumLit <$> sqlNumberTok False
 
 > characterSetLit :: Parser ValueExpr
 > characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok
@@ -999,7 +999,7 @@ for the escape now there is a separate lexer ...
 >     pure $ \v -> ctor v c
 >   where
 >     escapeChar :: Parser Char
->     escapeChar = (identifierTok <|> symbolTok) >>= oneOnly
+>     escapeChar = (identifierTok [] Nothing <|> symbolTok Nothing) >>= oneOnly
 >     oneOnly :: String -> Parser Char
 >     oneOnly c = case c of
 >                    [c'] -> return c'
@@ -1501,23 +1501,25 @@ and it will parse as a single string
 >       L.HostParam p -> Just p
 >       _ -> Nothing)
 
-> sqlNumberTok :: Parser String
-> sqlNumberTok = mytoken (\tok ->
+> sqlNumberTok :: Bool -> Parser String
+> sqlNumberTok intOnly = mytoken (\tok ->
 >     case tok of
->       L.SqlNumber p -> Just p
+>       L.SqlNumber p | not intOnly || all isDigit p -> Just p
 >       _ -> Nothing)
 
 
-> symbolTok :: Parser String
-> symbolTok = mytoken (\tok ->
->     case tok of
->       L.Symbol p -> Just p
+> symbolTok :: Maybe String -> Parser String
+> symbolTok sym = mytoken (\tok ->
+>     case (sym,tok) of
+>       (Nothing, L.Symbol p) -> Just p
+>       (Just s, L.Symbol p) | s == p -> Just p
 >       _ -> Nothing)
 
-> identifierTok :: Parser String
-> identifierTok = mytoken (\tok ->
->     case tok of
->       L.Identifier p -> Just p
+> identifierTok :: [String] -> Maybe String -> Parser String
+> identifierTok blackList kw = mytoken (\tok ->
+>     case (kw,tok) of
+>       (Nothing, L.Identifier p) | map toLower p `notElem` blackList -> Just p
+>       (Just k, L.Identifier p) | k == map toLower p -> Just p
 >       _ -> Nothing)
 
 > qidentifierTok :: Parser String
@@ -1547,19 +1549,12 @@ and it will parse as a single string
 >     testToken (_,tok)   = test tok
 
 > unsignedInteger :: Parser Integer
-> unsignedInteger = try (do
->                    x <- sqlNumberTok
->                    guard (all isDigit x)
->                    return $ read x
->                    ) <?> "integer"
+> unsignedInteger = read <$> sqlNumberTok True <?> "natural number"
 
 todo: work out the symbol parsing better
 
 > symbol :: String -> Parser String
-> symbol s = try (do
->   u <- symbolTok
->   guard (s == u)
->   pure s) <?> s
+> symbol s = symbolTok (Just s) <?> s
 
 > singleCharSymbol :: Char -> Parser Char
 > singleCharSymbol c = c <$ symbol [c]
@@ -1589,10 +1584,7 @@ todo: work out the symbol parsing better
 = helper functions
 
 > keyword :: String -> Parser String
-> keyword k = try (do
->     i <- identifierTok
->     guard (map toLower i == k)
->     pure k) <?> k
+> keyword k = identifierTok [] (Just k) <?> k
 
 helper function to improve error messages
 
@@ -1618,14 +1610,6 @@ helper function to improve error messages
 > commaSep1 :: Parser a -> Parser [a]
 > commaSep1 = (`sepBy1` comma)
 
-> identifierBlacklist :: [String] -> Parser String
-> identifierBlacklist bl = try (do
->     i <- identifierTok
->     when (map toLower i `elem` bl) $
->         fail $ "keyword not allowed here: " ++ i
->     pure i)
->     <?> "identifier"
-
 > blacklist :: Dialect -> [String]
 > blacklist = reservedWord
 
diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
index d328dde..4e33cc1 100644
--- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
@@ -51,8 +51,7 @@ Test for the lexer
 >     -- block comment
 >     ++ map (\c -> (c, [BlockComment c]))
 >        ["/**/", "/* */","/* this is a comment */"
->         -- todo: bug in the lexer, fix this
->         --,"/* this *is/ a comment */"
+>        ,"/* this *is/ a comment */"
 >        ]