work on lexing: error cases

fix so following a number with either . or e or E without whitespace will cause a lexing error remove */ from symbols in postgres (although postgres strictly speaking allows this I think, it is not a good idea) reject */ anywhere with an error reject ||| (or more pipes) in ansi, etc. dialects instead of trying to parse it as something like '||', '|'
2016-02-15 20:31:06 +02:00 · 2016-02-15 20:31:06 +02:00 · 9fd2970f26
commit 9fd2970f26
parent 4bca2fa2ec
4 changed files with 136 additions and 65 deletions
--- a/Language/SQL/SimpleSQL/Lex.lhs
+++ b/Language/SQL/SimpleSQL/Lex.lhs
@ -176,6 +176,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
 >                     ,blockComment d
 >                     ,sqlNumber d
 >                     ,positionalArg d
 >                     ,dontParseEndBlockComment d
 >                     ,symbol d
 >                     ,sqlWhitespace d]
@ -312,15 +313,20 @@ considered part of the constant; it is an operator applied to the
 constant.
 > sqlNumber :: Dialect -> Parser Token
-> sqlNumber _ = SqlNumber <$>
+> sqlNumber _ =
->     (int <??> (pp dot <??.> pp int)
+>     SqlNumber <$> completeNumber
->      -- try is used in case we read a dot
+>     -- this is for definitely avoiding possibly ambiguous source
->      -- and it isn't part of a number
+>     <* notFollowedBy (oneOf "eE.")
 >      -- if there are any following digits, then we commit
 >      -- to it being a number and not something else
 >      <|> try ((++) <$> dot <*> int))
 >     <??> pp expon
 >   where
 >     completeNumber =
 >       (int <??> (pp dot <??.> pp int)
 >       -- try is used in case we read a dot
 >       -- and it isn't part of a number
 >       -- if there are any following digits, then we commit
 >       -- to it being a number and not something else
 >       <|> try ((++) <$> dot <*> int))
 >       <??> pp expon
 >     int = many1 digit
 >     dot = string "."
 >     expon = (:) <$> oneOf "eE" <*> sInt
@ -363,9 +369,6 @@ which allows the last character of a multi character symbol to be + or
 -
 >    allOpSymbols = "+-*/<>=~!@#%^&|`?"
 >    -- all symbols except - and / which can be used to start
 >    -- a comment token
 >    allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols
 >    -- these are the symbols when if part of a multi character
 >    -- operator permit the operator to end with a + or - symbol
 >    exceptionOpSymbols = "~!@#%^&|`?"
@ -380,11 +383,14 @@ which allows the last character of a multi character symbol to be + or
 >    -- operator and we have alread seen one of the 'exception chars'
 >    -- and so we can end with a + or -
 >    moreOpCharsException = do
->        c <- oneOf allOpSymbolsNoCommentStarters
+>        c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
 >             -- make sure we don't parse a comment starting token
 >             -- as part of an operator
 >             <|> try (char '/' <* notFollowedBy (char '*'))
 >             <|> try (char '-' <* notFollowedBy (char '-'))
 >             -- and make sure we don't parse a block comment end
 >             -- as part of another symbol
 >             <|> try (char '*' <* notFollowedBy (char '/'))
 >        (c:) <$> option [] moreOpCharsException
 >    opMoreChars = choice
@ -402,8 +408,10 @@ which allows the last character of a multi character symbol to be + or
 >                   <* lookAhead (oneOf allOpSymbols))
 >              <|> -- parse / check it isn't the start of a /* comment
 >              try (char '/' <* notFollowedBy (char '*'))
 >              <|> -- make sure we don't parse */ as part of a symbol
 >              try (char '*' <* notFollowedBy (char '/'))
 >              <|> -- any other ansi operator symbol
->              oneOf "*<>=")
+>              oneOf "<>=")
 >         <*> option [] opMoreChars
 >        ]
@ -416,9 +424,11 @@ which allows the last character of a multi character symbol to be + or
 try is used because most of the first characters of the two character
 symbols can also be part of a single character symbol
->    regularOp = map (try . string) [">=","<=","!=","<>","||"]
+>    regularOp = map (try . string) [">=","<=","!=","<>"]
->                ++ map (string . (:[])) "+-^*/%~&|<>="
+>                ++ map (string . (:[])) "+-^*/%~&<>="
-
+>                ++ [char '|' *>
 >                    choice ["||" <$ char '|' <* notFollowedBy (char '|')
 >                           ,return "|"]]
 > symbol _ =
 >    Symbol <$> choice (otherSymbol ++ regularOp)
@ -429,8 +439,11 @@ symbols can also be part of a single character symbol
 try is used because most of the first characters of the two character
 symbols can also be part of a single character symbol
->    regularOp = map (try . string) [">=","<=","!=","<>","||"]
+>    regularOp = map (try . string) [">=","<=","!=","<>"]
->                ++ map (string . (:[])) "+-^*/%~&|<>="
+>                ++ map (string . (:[])) "+-^*/%~&<>=[]"
 >                ++ [char '|' *>
 >                    choice ["||" <$ char '|' <* notFollowedBy (char '|')
 >                           ,return "|"]]
@ -477,6 +490,18 @@ isn't there.
 >              ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
 This is to improve user experience: provide an error if we see */
 outside a comment. This could potentially break postgres ops with */
 in (which is a stupid thing to do). In other cases, the user should
 write * / instead (I can't think of any cases when this would be valid
 syntax though).
 > dontParseEndBlockComment :: Dialect -> Parser Token
 > dontParseEndBlockComment _ =
 >     -- don't use try, then it should commit to the error
 >     try (string "*/") *> fail "comment end without comment start"
 Some helper combinators
 > startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
@ -513,6 +538,10 @@ maybe do some quick checking to make sure this function only gives
 true negatives: check pairs which return false actually fail to lex or
 give different symbols in return
 a good sanity test for this function is to change it to always return
 true, then check that the automated tests return the same number of
 successes.
 > tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
 > tokenListWillPrintAndLex _ [] = True
 > tokenListWillPrintAndLex _ [_] = True
@ -535,7 +564,8 @@ two symbols next to eachother will fail if the symbols can combine and
 (possibly just the prefix) look like a different symbol, or if they
 combine to look like comment markers
-
+check if the end of one symbol and the start of the next can form a
 comment token
 > tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {})
 >     | a'@(_:_) <- prettyToken d a
@ -553,17 +583,13 @@ combine to look like comment markers
 >    ,(">","=")
 >    ,("!","=")
 >    ,("|","|")
->    -- todo: make the lexer special case reject ||| and so on
+>    ,("||","|")
 >    -- because it is ambiguous
 >    --,("||","|")
 >    ,("|","||")
->    --,("||","||")
+>    ,("||","||")
 >    ,("<",">=")
 >    ,("-","-")
 >    ,("/","*")
->    -- todo: special case make the lexer reject */ next to eachother
+>    ,("*","/")
 >    -- because trying to interpret it as *,/ will lead to much worse error messages
 >    --,("*","/")
 >    ]
 two whitespaces will be combined
@ -593,11 +619,17 @@ some of the symbol
 >        (_:_) -> last s /= '-'
 >        _ -> True
-in other situations a trailing line comment or a leading or trailing
+in other situations a trailing line comment will work
 block comment will work
 > tokensWillPrintAndLex _ _ LineComment {} = True
-> tokensWillPrintAndLex _ _ BlockComment {} = True
+
 block comments: make sure there isn't a * symbol immediately before the comment opening
 > tokensWillPrintAndLex d a BlockComment {} =
 >     case prettyToken d a of
 >         a'@(_:_) | last a' == '*' -> False
 >         _ -> True
 > tokensWillPrintAndLex _ BlockComment {} _ = True
@ -708,6 +740,11 @@ is an unambiguous parse
 TODO:
 refactor the tokenswillprintlex to be based on pretty printing the
 individual tokens
 start adding negative / different parse dialect tests
 lex @variable in sql server
 lex [quoted identifier] in sql server
 lex #variable in oracle
@ -715,22 +752,11 @@ lex #variable in oracle
 make a new ctor for @var, #var
 add token tables and tests for oracle, sql server
 review existing tables
 look for refactoring opportunities
 add odbc as a dialect flag and include {} as symbols when enabled
 add negative / different parse dialect tests
 refactor the tokenswillprintlex to be based on pretty printing the
 individual tokens
 add special cases to lexer:
 ||| three or more pipes is a syntax error (except for postgres)
    (and not ||,| or |,|| e.g.)
 extra ., e after number without whitespace is a syntax error
 reject a */ pair of chars instead of parsing it as a symbol or several
 symbols. Can this appear inside an operator in postgres? Let's make it
 not possible anywhere since this could be really confusing.
 do some user documentation on lexing, and lexing/dialects
--- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
@ -68,19 +68,38 @@ Test for the lexer
 > ansiLexerTests :: TestItem
 > ansiLexerTests = Group "ansiLexerTests" $
->     [Group "ansi lexer token tests" $ [LexerTest ansi2011 s t |  (s,t) <- ansiLexerTable]
+>     [Group "ansi lexer token tests" $ [LexTest ansi2011 s t |  (s,t) <- ansiLexerTable]
 >     ,Group "ansi generated combination lexer tests" $
->     [ LexerTest ansi2011 (s ++ s1) (t ++ t1)
+>     [ LexTest ansi2011 (s ++ s1) (t ++ t1)
 >     | (s,t) <- ansiLexerTable
 >     , (s1,t1) <- ansiLexerTable
 >     , tokenListWillPrintAndLex ansi2011 $ t ++ t1
 >     ]
->     ,Group "adhoc lexer tests" $
+>     ,Group "ansiadhoclexertests" $
->        map (uncurry $ LexerTest ansi2011)
+>        map (uncurry $ LexTest ansi2011)
 >        [("", [])
 >        ,("-- line com\nstuff", [LineComment "-- line com\n",Identifier Nothing "stuff"])
->        ]
+>        ] ++
 >        [-- want to make sure this gives a parse error
 >         LexFails ansi2011 "*/"
 >         -- combinations of pipes: make sure they fail because they could be
 >         -- ambiguous and it is really unclear when they are or not, and
 >         -- what the result is even when they are not ambiguous
 >        ,LexFails ansi2011 "|||"
 >        ,LexFails ansi2011 "||||"
 >        ,LexFails ansi2011 "|||||"
 >        -- another user experience thing: make sure extra trailing
 >        -- number chars are rejected rather than attempting to parse
 >        -- if the user means to write something that is rejected by this code,
 >        -- then they can use whitespace to make it clear and then it will parse
 >        ,LexFails ansi2011 "12e3e4"
 >        ,LexFails ansi2011 "12e3e4"
 >        ,LexFails ansi2011 "12e3e4"
 >        ,LexFails ansi2011 "12e3.4"
 >        ,LexFails ansi2011 "12.4.5"
 >        ,LexFails ansi2011 "12.4e5.6"
 >        ,LexFails ansi2011 "12.4e5e7"]
 >      ]
 todo: lexing tests
@ -187,7 +206,7 @@ also: do the testing for the ansi compatibility special cases
 >        [ x
 >        | n <- [1..l]
 >        , x <- combos "+-*/<>=~!@#%^&|`?" n
->        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
 >        , not (last x `elem` "+-")
 >          || or (map (`elem` x) "~!@#%^&|`?")
 >        ]
@ -201,7 +220,7 @@ the + or -.
 >        [ x
 >        | n <- [1..l]
 >        , x <- combos "+-*/<>=" n
->        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
 >        , not (last x `elem` "+-")
 >        ]
@ -209,27 +228,44 @@ the + or -.
 > postgresLexerTests :: TestItem
 > postgresLexerTests = Group "postgresLexerTests" $
 >     [Group "postgres lexer token tests" $
->      [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
+>      [LexTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
 >     ,Group "postgres generated combination lexer tests" $
->     [ LexerTest postgres (s ++ s1) (t ++ t1)
+>     [ LexTest postgres (s ++ s1) (t ++ t1)
 >     | (s,t) <- postgresLexerTable
 >     , (s1,t1) <- postgresLexerTable
 >     , tokenListWillPrintAndLex postgres $ t ++ t1
 >     ]
->     ,Group "adhoc postgres lexer tests" $
+>     ,Group "adhoc postgres lexertests" $
->      [LexerTest postgres s t
+>      [LexTest postgres s t
 >      | (s,t) <- edgeCaseCommentOps
 >                 ++ edgeCasePlusMinusOps
 >                 ++ edgeCasePlusMinusComments]
 >      ++
 >       -- need more tests for */ to make sure it is caught if it is in the middle of a
 >       -- sequence of symbol letters
 >         [LexFails postgres "*/"
 >         ,LexFails postgres "@*/"
 >         ,LexFails postgres "-*/"
 >         ,LexFails postgres "12e3e4"
 >         ,LexFails postgres "12e3e4"
 >         ,LexFails postgres "12e3e4"
 >         ,LexFails postgres "12e3.4"
 >         ,LexFails postgres "12.4.5"
 >         ,LexFails postgres "12.4e5.6"
 >         ,LexFails postgres "12.4e5e7"]
 >     ]
 >  where
->    edgeCaseCommentOps = concat
+>    edgeCaseCommentOps =
->      [ [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
+>      [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
->        , (x ++ "--<test", [Symbol x, LineComment "--<test"]) ]
+>      | x <- eccops
->      | x <- someValidPostgresOperators 2
+>      , not (last x == '*')
 >      ] ++
 >      [ (x ++ "--<test", [Symbol x, LineComment "--<test"])
 >      | x <- eccops
 >      , not (last x == '-')
 >      ]
 >    eccops = someValidPostgresOperators 2
 >    edgeCasePlusMinusOps = concat
 >      [ [ (x ++ "+", [Symbol x, Symbol "+"])
 >        , (x ++ "-", [Symbol x, Symbol "-"]) ]
@ -244,22 +280,22 @@ the + or -.
 > sqlServerLexerTests :: TestItem
-> sqlServerLexerTests = Group "sqlServerLexerTests" $
+> sqlServerLexerTests = Group "sqlServerLexTests" $
->     [ LexerTest sqlserver s t | (s,t) <-
+>     [ LexTest sqlserver s t | (s,t) <-
 >     [--("@variable", [(Identifier (Just ("@", "")) "variable")])
 >     --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")])
 >     ]]
 > oracleLexerTests :: TestItem
-> oracleLexerTests = Group "oracleLexerTests" $
+> oracleLexerTests = Group "oracleLexTests" $
->     [ LexerTest oracle s t | (s,t) <-
+>     [ LexTest oracle s t | (s,t) <-
 >     [--("#variable", [(Identifier (Just ("#", "")) "variable")])
 >     ]
 >     ]
 > odbcLexerTests :: TestItem
-> odbcLexerTests = Group "odbcLexerTests" $
+> odbcLexerTests = Group "odbcLexTests" $
->     [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <-
+>     [ LexTest sqlserver {- {odbc = True} -} s t | (s,t) <-
 >     [--("{}", [Symbol "{", Symbol "}"])
 >     ]
 >     ]
--- a/tools/Language/SQL/SimpleSQL/TestTypes.lhs
+++ b/tools/Language/SQL/SimpleSQL/TestTypes.lhs
@ -30,5 +30,6 @@ check that the string given fails to parse
 >               | ParseQueryExprFails Dialect String
 >               | ParseValueExprFails Dialect String
->               | LexerTest Dialect String [Token]
+>               | LexTest Dialect String [Token]
 >               | LexFails Dialect String
 >                 deriving (Eq,Show)
--- a/tools/Language/SQL/SimpleSQL/Tests.lhs
+++ b/tools/Language/SQL/SimpleSQL/Tests.lhs
@ -86,7 +86,8 @@ order on the generated documentation.
 > itemToTest (ParseValueExprFails d str) =
 >     toFTest parseValueExpr prettyValueExpr d str
-> itemToTest (LexerTest d s ts) = makeLexerTest d s ts
+> itemToTest (LexTest d s ts) = makeLexerTest d s ts
 > itemToTest (LexFails d s) = makeLexingFailsTest d s
 > makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree
 > makeLexerTest d s ts = H.testCase s $ do
@ -95,6 +96,13 @@ order on the generated documentation.
 >     let s' = prettyTokens d $ map snd lx
 >     H.assertEqual "pretty print" s s'
 > makeLexingFailsTest :: Dialect -> String -> T.TestTree
 > makeLexingFailsTest d s = H.testCase s $ do
 >     case lexSQL d "" Nothing s of
 >          Right x -> H.assertFailure $ "lexing should have failed: " ++ s ++ "\ngot: " ++ show x
 >          Left _ -> return ()
 > toTest :: (Eq a, Show a) =>
 >           (Dialect -> String -> Maybe (Int,Int) -> String -> Either ParseError a)
 >        -> (Dialect -> a -> String)