From 9fd2970f26c9b530143af683d69ca63453c4b305 Mon Sep 17 00:00:00 2001
From: Jake Wheat <jakewheatmail@gmail.com>
Date: Mon, 15 Feb 2016 20:31:06 +0200
Subject: [PATCH] work on lexing: error cases

fix so following a number with either . or e or E without whitespace
  will cause a lexing error
remove */ from symbols in postgres (although postgres strictly
  speaking allows this I think, it is not a good idea)
reject */ anywhere with an error
reject ||| (or more pipes) in ansi, etc. dialects instead of trying to
  parse it as something like '||', '|'
---
 Language/SQL/SimpleSQL/Lex.lhs              | 110 ++++++++++++--------
 tools/Language/SQL/SimpleSQL/LexerTests.lhs |  78 ++++++++++----
 tools/Language/SQL/SimpleSQL/TestTypes.lhs  |   3 +-
 tools/Language/SQL/SimpleSQL/Tests.lhs      |  10 +-
 4 files changed, 136 insertions(+), 65 deletions(-)

diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs
index 302404b..fa4651d 100644
--- a/Language/SQL/SimpleSQL/Lex.lhs
+++ b/Language/SQL/SimpleSQL/Lex.lhs
@@ -176,6 +176,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
 >                     ,blockComment d
 >                     ,sqlNumber d
 >                     ,positionalArg d
+>                     ,dontParseEndBlockComment d
 >                     ,symbol d
 >                     ,sqlWhitespace d]
 
@@ -312,15 +313,20 @@ considered part of the constant; it is an operator applied to the
 constant.
 
 > sqlNumber :: Dialect -> Parser Token
-> sqlNumber _ = SqlNumber <$>
->     (int <??> (pp dot <??.> pp int)
->      -- try is used in case we read a dot
->      -- and it isn't part of a number
->      -- if there are any following digits, then we commit
->      -- to it being a number and not something else
->      <|> try ((++) <$> dot <*> int))
->     <??> pp expon
+> sqlNumber _ =
+>     SqlNumber <$> completeNumber
+>     -- this is for definitely avoiding possibly ambiguous source
+>     <* notFollowedBy (oneOf "eE.")
 >   where
+>     completeNumber =
+>       (int <??> (pp dot <??.> pp int)
+>       -- try is used in case we read a dot
+>       -- and it isn't part of a number
+>       -- if there are any following digits, then we commit
+>       -- to it being a number and not something else
+>       <|> try ((++) <$> dot <*> int))
+>       <??> pp expon
+
 >     int = many1 digit
 >     dot = string "."
 >     expon = (:) <$> oneOf "eE" <*> sInt
@@ -363,9 +369,6 @@ which allows the last character of a multi character symbol to be + or
 -
 
 >    allOpSymbols = "+-*/<>=~!@#%^&|`?"
->    -- all symbols except - and / which can be used to start
->    -- a comment token
->    allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols
 >    -- these are the symbols when if part of a multi character
 >    -- operator permit the operator to end with a + or - symbol
 >    exceptionOpSymbols = "~!@#%^&|`?"
@@ -380,11 +383,14 @@ which allows the last character of a multi character symbol to be + or
 >    -- operator and we have alread seen one of the 'exception chars'
 >    -- and so we can end with a + or -
 >    moreOpCharsException = do
->        c <- oneOf allOpSymbolsNoCommentStarters
+>        c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
 >             -- make sure we don't parse a comment starting token
 >             -- as part of an operator
 >             <|> try (char '/' <* notFollowedBy (char '*'))
 >             <|> try (char '-' <* notFollowedBy (char '-'))
+>             -- and make sure we don't parse a block comment end
+>             -- as part of another symbol
+>             <|> try (char '*' <* notFollowedBy (char '/'))
 >        (c:) <$> option [] moreOpCharsException
 
 >    opMoreChars = choice
@@ -402,8 +408,10 @@ which allows the last character of a multi character symbol to be + or
 >                   <* lookAhead (oneOf allOpSymbols))
 >              <|> -- parse / check it isn't the start of a /* comment
 >              try (char '/' <* notFollowedBy (char '*'))
+>              <|> -- make sure we don't parse */ as part of a symbol
+>              try (char '*' <* notFollowedBy (char '/'))
 >              <|> -- any other ansi operator symbol
->              oneOf "*<>=")
+>              oneOf "<>=")
 >         <*> option [] opMoreChars
 >        ]
 
@@ -416,9 +424,11 @@ which allows the last character of a multi character symbol to be + or
 try is used because most of the first characters of the two character
 symbols can also be part of a single character symbol
 
->    regularOp = map (try . string) [">=","<=","!=","<>","||"]
->                ++ map (string . (:[])) "+-^*/%~&|<>="
-
+>    regularOp = map (try . string) [">=","<=","!=","<>"]
+>                ++ map (string . (:[])) "+-^*/%~&<>="
+>                ++ [char '|' *>
+>                    choice ["||" <$ char '|' <* notFollowedBy (char '|')
+>                           ,return "|"]]
 
 > symbol _ =
 >    Symbol <$> choice (otherSymbol ++ regularOp)
@@ -429,8 +439,11 @@ symbols can also be part of a single character symbol
 try is used because most of the first characters of the two character
 symbols can also be part of a single character symbol
 
->    regularOp = map (try . string) [">=","<=","!=","<>","||"]
->                ++ map (string . (:[])) "+-^*/%~&|<>="
+>    regularOp = map (try . string) [">=","<=","!=","<>"]
+>                ++ map (string . (:[])) "+-^*/%~&<>=[]"
+>                ++ [char '|' *>
+>                    choice ["||" <$ char '|' <* notFollowedBy (char '|')
+>                           ,return "|"]]
 
 
 
@@ -477,6 +490,18 @@ isn't there.
 >              ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
 
 
+This is to improve user experience: provide an error if we see */
+outside a comment. This could potentially break postgres ops with */
+in (which is a stupid thing to do). In other cases, the user should
+write * / instead (I can't think of any cases when this would be valid
+syntax though).
+
+> dontParseEndBlockComment :: Dialect -> Parser Token
+> dontParseEndBlockComment _ =
+>     -- don't use try, then it should commit to the error
+>     try (string "*/") *> fail "comment end without comment start"
+
+
 Some helper combinators
 
 > startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
@@ -513,6 +538,10 @@ maybe do some quick checking to make sure this function only gives
 true negatives: check pairs which return false actually fail to lex or
 give different symbols in return
 
+a good sanity test for this function is to change it to always return
+true, then check that the automated tests return the same number of
+successes.
+
 > tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
 > tokenListWillPrintAndLex _ [] = True
 > tokenListWillPrintAndLex _ [_] = True
@@ -535,7 +564,8 @@ two symbols next to eachother will fail if the symbols can combine and
 (possibly just the prefix) look like a different symbol, or if they
 combine to look like comment markers
 
-
+check if the end of one symbol and the start of the next can form a
+comment token
 
 > tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {})
 >     | a'@(_:_) <- prettyToken d a
@@ -553,17 +583,13 @@ combine to look like comment markers
 >    ,(">","=")
 >    ,("!","=")
 >    ,("|","|")
->    -- todo: make the lexer special case reject ||| and so on
->    -- because it is ambiguous
->    --,("||","|")
+>    ,("||","|")
 >    ,("|","||")
->    --,("||","||")
+>    ,("||","||")
 >    ,("<",">=")
 >    ,("-","-")
 >    ,("/","*")
->    -- todo: special case make the lexer reject */ next to eachother
->    -- because trying to interpret it as *,/ will lead to much worse error messages
->    --,("*","/")
+>    ,("*","/")
 >    ]
 
 two whitespaces will be combined
@@ -593,11 +619,17 @@ some of the symbol
 >        (_:_) -> last s /= '-'
 >        _ -> True
 
-in other situations a trailing line comment or a leading or trailing
-block comment will work
+in other situations a trailing line comment will work
 
 > tokensWillPrintAndLex _ _ LineComment {} = True
-> tokensWillPrintAndLex _ _ BlockComment {} = True
+
+block comments: make sure there isn't a * symbol immediately before the comment opening
+
+> tokensWillPrintAndLex d a BlockComment {} =
+>     case prettyToken d a of
+>         a'@(_:_) | last a' == '*' -> False
+>         _ -> True
+
 > tokensWillPrintAndLex _ BlockComment {} _ = True
 
 
@@ -708,6 +740,11 @@ is an unambiguous parse
 
 TODO:
 
+refactor the tokenswillprintlex to be based on pretty printing the
+individual tokens
+
+start adding negative / different parse dialect tests
+
 lex @variable in sql server
 lex [quoted identifier] in sql server
 lex #variable in oracle
@@ -715,22 +752,11 @@ lex #variable in oracle
 make a new ctor for @var, #var
 
 add token tables and tests for oracle, sql server
+review existing tables
+look for refactoring opportunities
 
 add odbc as a dialect flag and include {} as symbols when enabled
 
-add negative / different parse dialect tests
-
-refactor the tokenswillprintlex to be based on pretty printing the
-individual tokens
-
-add special cases to lexer:
-||| three or more pipes is a syntax error (except for postgres)
-    (and not ||,| or |,|| e.g.)
-extra ., e after number without whitespace is a syntax error
-
-reject a */ pair of chars instead of parsing it as a symbol or several
-symbols. Can this appear inside an operator in postgres? Let's make it
-not possible anywhere since this could be really confusing.
 
 do some user documentation on lexing, and lexing/dialects
 
diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
index 3737091..269d4ec 100644
--- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
@@ -68,19 +68,38 @@ Test for the lexer
 
 > ansiLexerTests :: TestItem
 > ansiLexerTests = Group "ansiLexerTests" $
->     [Group "ansi lexer token tests" $ [LexerTest ansi2011 s t |  (s,t) <- ansiLexerTable]
+>     [Group "ansi lexer token tests" $ [LexTest ansi2011 s t |  (s,t) <- ansiLexerTable]
 >     ,Group "ansi generated combination lexer tests" $
->     [ LexerTest ansi2011 (s ++ s1) (t ++ t1)
+>     [ LexTest ansi2011 (s ++ s1) (t ++ t1)
 >     | (s,t) <- ansiLexerTable
 >     , (s1,t1) <- ansiLexerTable
 >     , tokenListWillPrintAndLex ansi2011 $ t ++ t1
 
 >     ]
->     ,Group "adhoc lexer tests" $
->        map (uncurry $ LexerTest ansi2011)
+>     ,Group "ansiadhoclexertests" $
+>        map (uncurry $ LexTest ansi2011)
 >        [("", [])
 >        ,("-- line com\nstuff", [LineComment "-- line com\n",Identifier Nothing "stuff"])
->        ]
+>        ] ++
+>        [-- want to make sure this gives a parse error
+>         LexFails ansi2011 "*/"
+>         -- combinations of pipes: make sure they fail because they could be
+>         -- ambiguous and it is really unclear when they are or not, and
+>         -- what the result is even when they are not ambiguous
+>        ,LexFails ansi2011 "|||"
+>        ,LexFails ansi2011 "||||"
+>        ,LexFails ansi2011 "|||||"
+>        -- another user experience thing: make sure extra trailing
+>        -- number chars are rejected rather than attempting to parse
+>        -- if the user means to write something that is rejected by this code,
+>        -- then they can use whitespace to make it clear and then it will parse
+>        ,LexFails ansi2011 "12e3e4"
+>        ,LexFails ansi2011 "12e3e4"
+>        ,LexFails ansi2011 "12e3e4"
+>        ,LexFails ansi2011 "12e3.4"
+>        ,LexFails ansi2011 "12.4.5"
+>        ,LexFails ansi2011 "12.4e5.6"
+>        ,LexFails ansi2011 "12.4e5e7"]
 >      ]
 
 todo: lexing tests
@@ -187,7 +206,7 @@ also: do the testing for the ansi compatibility special cases
 >        [ x
 >        | n <- [1..l]
 >        , x <- combos "+-*/<>=~!@#%^&|`?" n
->        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
 >        , not (last x `elem` "+-")
 >          || or (map (`elem` x) "~!@#%^&|`?")
 >        ]
@@ -201,7 +220,7 @@ the + or -.
 >        [ x
 >        | n <- [1..l]
 >        , x <- combos "+-*/<>=" n
->        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
 >        , not (last x `elem` "+-")
 >        ]
 
@@ -209,27 +228,44 @@ the + or -.
 > postgresLexerTests :: TestItem
 > postgresLexerTests = Group "postgresLexerTests" $
 >     [Group "postgres lexer token tests" $
->      [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
+>      [LexTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
 >     ,Group "postgres generated combination lexer tests" $
->     [ LexerTest postgres (s ++ s1) (t ++ t1)
+>     [ LexTest postgres (s ++ s1) (t ++ t1)
 >     | (s,t) <- postgresLexerTable
 >     , (s1,t1) <- postgresLexerTable
 >     , tokenListWillPrintAndLex postgres $ t ++ t1
 
 >     ]
->     ,Group "adhoc postgres lexer tests" $
->      [LexerTest postgres s t
+>     ,Group "adhoc postgres lexertests" $
+>      [LexTest postgres s t
 >      | (s,t) <- edgeCaseCommentOps
 >                 ++ edgeCasePlusMinusOps
 >                 ++ edgeCasePlusMinusComments]
+>      ++
+>       -- need more tests for */ to make sure it is caught if it is in the middle of a
+>       -- sequence of symbol letters
+>         [LexFails postgres "*/"
+>         ,LexFails postgres "@*/"
+>         ,LexFails postgres "-*/"
+>         ,LexFails postgres "12e3e4"
+>         ,LexFails postgres "12e3e4"
+>         ,LexFails postgres "12e3e4"
+>         ,LexFails postgres "12e3.4"
+>         ,LexFails postgres "12.4.5"
+>         ,LexFails postgres "12.4e5.6"
+>         ,LexFails postgres "12.4e5e7"]
 >     ]
 >  where
->    edgeCaseCommentOps = concat
->      [ [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
->        , (x ++ "--<test", [Symbol x, LineComment "--<test"]) ]
->      | x <- someValidPostgresOperators 2
+>    edgeCaseCommentOps =
+>      [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
+>      | x <- eccops
+>      , not (last x == '*')
+>      ] ++
+>      [ (x ++ "--<test", [Symbol x, LineComment "--<test"])
+>      | x <- eccops
 >      , not (last x == '-')
 >      ]
+>    eccops = someValidPostgresOperators 2
 >    edgeCasePlusMinusOps = concat
 >      [ [ (x ++ "+", [Symbol x, Symbol "+"])
 >        , (x ++ "-", [Symbol x, Symbol "-"]) ]
@@ -244,22 +280,22 @@ the + or -.
 
 
 > sqlServerLexerTests :: TestItem
-> sqlServerLexerTests = Group "sqlServerLexerTests" $
->     [ LexerTest sqlserver s t | (s,t) <-
+> sqlServerLexerTests = Group "sqlServerLexTests" $
+>     [ LexTest sqlserver s t | (s,t) <-
 >     [--("@variable", [(Identifier (Just ("@", "")) "variable")])
 >     --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")])
 >     ]]
 
 > oracleLexerTests :: TestItem
-> oracleLexerTests = Group "oracleLexerTests" $
->     [ LexerTest oracle s t | (s,t) <-
+> oracleLexerTests = Group "oracleLexTests" $
+>     [ LexTest oracle s t | (s,t) <-
 >     [--("#variable", [(Identifier (Just ("#", "")) "variable")])
 >     ]
 >     ]
 
 > odbcLexerTests :: TestItem
-> odbcLexerTests = Group "odbcLexerTests" $
->     [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <-
+> odbcLexerTests = Group "odbcLexTests" $
+>     [ LexTest sqlserver {- {odbc = True} -} s t | (s,t) <-
 >     [--("{}", [Symbol "{", Symbol "}"])
 >     ]
 >     ]
diff --git a/tools/Language/SQL/SimpleSQL/TestTypes.lhs b/tools/Language/SQL/SimpleSQL/TestTypes.lhs
index 0022a14..b9cb4bc 100644
--- a/tools/Language/SQL/SimpleSQL/TestTypes.lhs
+++ b/tools/Language/SQL/SimpleSQL/TestTypes.lhs
@@ -30,5 +30,6 @@ check that the string given fails to parse
 
 >               | ParseQueryExprFails Dialect String
 >               | ParseValueExprFails Dialect String
->               | LexerTest Dialect String [Token]
+>               | LexTest Dialect String [Token]
+>               | LexFails Dialect String
 >                 deriving (Eq,Show)
diff --git a/tools/Language/SQL/SimpleSQL/Tests.lhs b/tools/Language/SQL/SimpleSQL/Tests.lhs
index ec32437..2ab843e 100644
--- a/tools/Language/SQL/SimpleSQL/Tests.lhs
+++ b/tools/Language/SQL/SimpleSQL/Tests.lhs
@@ -86,7 +86,8 @@ order on the generated documentation.
 > itemToTest (ParseValueExprFails d str) =
 >     toFTest parseValueExpr prettyValueExpr d str
 
-> itemToTest (LexerTest d s ts) = makeLexerTest d s ts
+> itemToTest (LexTest d s ts) = makeLexerTest d s ts
+> itemToTest (LexFails d s) = makeLexingFailsTest d s
 
 > makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree
 > makeLexerTest d s ts = H.testCase s $ do
@@ -95,6 +96,13 @@ order on the generated documentation.
 >     let s' = prettyTokens d $ map snd lx
 >     H.assertEqual "pretty print" s s'
 
+> makeLexingFailsTest :: Dialect -> String -> T.TestTree
+> makeLexingFailsTest d s = H.testCase s $ do
+>     case lexSQL d "" Nothing s of
+>          Right x -> H.assertFailure $ "lexing should have failed: " ++ s ++ "\ngot: " ++ show x
+>          Left _ -> return ()
+
+
 > toTest :: (Eq a, Show a) =>
 >           (Dialect -> String -> Maybe (Int,Int) -> String -> Either ParseError a)
 >        -> (Dialect -> a -> String)