work on lexing: error cases
fix so following a number with either . or e or E without whitespace will cause a lexing error remove */ from symbols in postgres (although postgres strictly speaking allows this I think, it is not a good idea) reject */ anywhere with an error reject ||| (or more pipes) in ansi, etc. dialects instead of trying to parse it as something like '||', '|'
This commit is contained in:
parent
4bca2fa2ec
commit
9fd2970f26
|
@ -176,6 +176,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
|
|||
> ,blockComment d
|
||||
> ,sqlNumber d
|
||||
> ,positionalArg d
|
||||
> ,dontParseEndBlockComment d
|
||||
> ,symbol d
|
||||
> ,sqlWhitespace d]
|
||||
|
||||
|
@ -312,15 +313,20 @@ considered part of the constant; it is an operator applied to the
|
|||
constant.
|
||||
|
||||
> sqlNumber :: Dialect -> Parser Token
|
||||
> sqlNumber _ = SqlNumber <$>
|
||||
> (int <??> (pp dot <??.> pp int)
|
||||
> -- try is used in case we read a dot
|
||||
> -- and it isn't part of a number
|
||||
> -- if there are any following digits, then we commit
|
||||
> -- to it being a number and not something else
|
||||
> <|> try ((++) <$> dot <*> int))
|
||||
> <??> pp expon
|
||||
> sqlNumber _ =
|
||||
> SqlNumber <$> completeNumber
|
||||
> -- this is for definitely avoiding possibly ambiguous source
|
||||
> <* notFollowedBy (oneOf "eE.")
|
||||
> where
|
||||
> completeNumber =
|
||||
> (int <??> (pp dot <??.> pp int)
|
||||
> -- try is used in case we read a dot
|
||||
> -- and it isn't part of a number
|
||||
> -- if there are any following digits, then we commit
|
||||
> -- to it being a number and not something else
|
||||
> <|> try ((++) <$> dot <*> int))
|
||||
> <??> pp expon
|
||||
|
||||
> int = many1 digit
|
||||
> dot = string "."
|
||||
> expon = (:) <$> oneOf "eE" <*> sInt
|
||||
|
@ -363,9 +369,6 @@ which allows the last character of a multi character symbol to be + or
|
|||
-
|
||||
|
||||
> allOpSymbols = "+-*/<>=~!@#%^&|`?"
|
||||
> -- all symbols except - and / which can be used to start
|
||||
> -- a comment token
|
||||
> allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols
|
||||
> -- these are the symbols when if part of a multi character
|
||||
> -- operator permit the operator to end with a + or - symbol
|
||||
> exceptionOpSymbols = "~!@#%^&|`?"
|
||||
|
@ -380,11 +383,14 @@ which allows the last character of a multi character symbol to be + or
|
|||
> -- operator and we have alread seen one of the 'exception chars'
|
||||
> -- and so we can end with a + or -
|
||||
> moreOpCharsException = do
|
||||
> c <- oneOf allOpSymbolsNoCommentStarters
|
||||
> c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
|
||||
> -- make sure we don't parse a comment starting token
|
||||
> -- as part of an operator
|
||||
> <|> try (char '/' <* notFollowedBy (char '*'))
|
||||
> <|> try (char '-' <* notFollowedBy (char '-'))
|
||||
> -- and make sure we don't parse a block comment end
|
||||
> -- as part of another symbol
|
||||
> <|> try (char '*' <* notFollowedBy (char '/'))
|
||||
> (c:) <$> option [] moreOpCharsException
|
||||
|
||||
> opMoreChars = choice
|
||||
|
@ -402,8 +408,10 @@ which allows the last character of a multi character symbol to be + or
|
|||
> <* lookAhead (oneOf allOpSymbols))
|
||||
> <|> -- parse / check it isn't the start of a /* comment
|
||||
> try (char '/' <* notFollowedBy (char '*'))
|
||||
> <|> -- make sure we don't parse */ as part of a symbol
|
||||
> try (char '*' <* notFollowedBy (char '/'))
|
||||
> <|> -- any other ansi operator symbol
|
||||
> oneOf "*<>=")
|
||||
> oneOf "<>=")
|
||||
> <*> option [] opMoreChars
|
||||
> ]
|
||||
|
||||
|
@ -416,9 +424,11 @@ which allows the last character of a multi character symbol to be + or
|
|||
try is used because most of the first characters of the two character
|
||||
symbols can also be part of a single character symbol
|
||||
|
||||
> regularOp = map (try . string) [">=","<=","!=","<>","||"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&|<>="
|
||||
|
||||
> regularOp = map (try . string) [">=","<=","!=","<>"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&<>="
|
||||
> ++ [char '|' *>
|
||||
> choice ["||" <$ char '|' <* notFollowedBy (char '|')
|
||||
> ,return "|"]]
|
||||
|
||||
> symbol _ =
|
||||
> Symbol <$> choice (otherSymbol ++ regularOp)
|
||||
|
@ -429,8 +439,11 @@ symbols can also be part of a single character symbol
|
|||
try is used because most of the first characters of the two character
|
||||
symbols can also be part of a single character symbol
|
||||
|
||||
> regularOp = map (try . string) [">=","<=","!=","<>","||"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&|<>="
|
||||
> regularOp = map (try . string) [">=","<=","!=","<>"]
|
||||
> ++ map (string . (:[])) "+-^*/%~&<>=[]"
|
||||
> ++ [char '|' *>
|
||||
> choice ["||" <$ char '|' <* notFollowedBy (char '|')
|
||||
> ,return "|"]]
|
||||
|
||||
|
||||
|
||||
|
@ -477,6 +490,18 @@ isn't there.
|
|||
> ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
|
||||
|
||||
|
||||
This is to improve user experience: provide an error if we see */
|
||||
outside a comment. This could potentially break postgres ops with */
|
||||
in (which is a stupid thing to do). In other cases, the user should
|
||||
write * / instead (I can't think of any cases when this would be valid
|
||||
syntax though).
|
||||
|
||||
> dontParseEndBlockComment :: Dialect -> Parser Token
|
||||
> dontParseEndBlockComment _ =
|
||||
> -- don't use try, then it should commit to the error
|
||||
> try (string "*/") *> fail "comment end without comment start"
|
||||
|
||||
|
||||
Some helper combinators
|
||||
|
||||
> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
|
||||
|
@ -513,6 +538,10 @@ maybe do some quick checking to make sure this function only gives
|
|||
true negatives: check pairs which return false actually fail to lex or
|
||||
give different symbols in return
|
||||
|
||||
a good sanity test for this function is to change it to always return
|
||||
true, then check that the automated tests return the same number of
|
||||
successes.
|
||||
|
||||
> tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
|
||||
> tokenListWillPrintAndLex _ [] = True
|
||||
> tokenListWillPrintAndLex _ [_] = True
|
||||
|
@ -535,7 +564,8 @@ two symbols next to eachother will fail if the symbols can combine and
|
|||
(possibly just the prefix) look like a different symbol, or if they
|
||||
combine to look like comment markers
|
||||
|
||||
|
||||
check if the end of one symbol and the start of the next can form a
|
||||
comment token
|
||||
|
||||
> tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {})
|
||||
> | a'@(_:_) <- prettyToken d a
|
||||
|
@ -553,17 +583,13 @@ combine to look like comment markers
|
|||
> ,(">","=")
|
||||
> ,("!","=")
|
||||
> ,("|","|")
|
||||
> -- todo: make the lexer special case reject ||| and so on
|
||||
> -- because it is ambiguous
|
||||
> --,("||","|")
|
||||
> ,("||","|")
|
||||
> ,("|","||")
|
||||
> --,("||","||")
|
||||
> ,("||","||")
|
||||
> ,("<",">=")
|
||||
> ,("-","-")
|
||||
> ,("/","*")
|
||||
> -- todo: special case make the lexer reject */ next to eachother
|
||||
> -- because trying to interpret it as *,/ will lead to much worse error messages
|
||||
> --,("*","/")
|
||||
> ,("*","/")
|
||||
> ]
|
||||
|
||||
two whitespaces will be combined
|
||||
|
@ -593,11 +619,17 @@ some of the symbol
|
|||
> (_:_) -> last s /= '-'
|
||||
> _ -> True
|
||||
|
||||
in other situations a trailing line comment or a leading or trailing
|
||||
block comment will work
|
||||
in other situations a trailing line comment will work
|
||||
|
||||
> tokensWillPrintAndLex _ _ LineComment {} = True
|
||||
> tokensWillPrintAndLex _ _ BlockComment {} = True
|
||||
|
||||
block comments: make sure there isn't a * symbol immediately before the comment opening
|
||||
|
||||
> tokensWillPrintAndLex d a BlockComment {} =
|
||||
> case prettyToken d a of
|
||||
> a'@(_:_) | last a' == '*' -> False
|
||||
> _ -> True
|
||||
|
||||
> tokensWillPrintAndLex _ BlockComment {} _ = True
|
||||
|
||||
|
||||
|
@ -708,6 +740,11 @@ is an unambiguous parse
|
|||
|
||||
TODO:
|
||||
|
||||
refactor the tokenswillprintlex to be based on pretty printing the
|
||||
individual tokens
|
||||
|
||||
start adding negative / different parse dialect tests
|
||||
|
||||
lex @variable in sql server
|
||||
lex [quoted identifier] in sql server
|
||||
lex #variable in oracle
|
||||
|
@ -715,22 +752,11 @@ lex #variable in oracle
|
|||
make a new ctor for @var, #var
|
||||
|
||||
add token tables and tests for oracle, sql server
|
||||
review existing tables
|
||||
look for refactoring opportunities
|
||||
|
||||
add odbc as a dialect flag and include {} as symbols when enabled
|
||||
|
||||
add negative / different parse dialect tests
|
||||
|
||||
refactor the tokenswillprintlex to be based on pretty printing the
|
||||
individual tokens
|
||||
|
||||
add special cases to lexer:
|
||||
||| three or more pipes is a syntax error (except for postgres)
|
||||
(and not ||,| or |,|| e.g.)
|
||||
extra ., e after number without whitespace is a syntax error
|
||||
|
||||
reject a */ pair of chars instead of parsing it as a symbol or several
|
||||
symbols. Can this appear inside an operator in postgres? Let's make it
|
||||
not possible anywhere since this could be really confusing.
|
||||
|
||||
do some user documentation on lexing, and lexing/dialects
|
||||
|
||||
|
|
|
@ -68,19 +68,38 @@ Test for the lexer
|
|||
|
||||
> ansiLexerTests :: TestItem
|
||||
> ansiLexerTests = Group "ansiLexerTests" $
|
||||
> [Group "ansi lexer token tests" $ [LexerTest ansi2011 s t | (s,t) <- ansiLexerTable]
|
||||
> [Group "ansi lexer token tests" $ [LexTest ansi2011 s t | (s,t) <- ansiLexerTable]
|
||||
> ,Group "ansi generated combination lexer tests" $
|
||||
> [ LexerTest ansi2011 (s ++ s1) (t ++ t1)
|
||||
> [ LexTest ansi2011 (s ++ s1) (t ++ t1)
|
||||
> | (s,t) <- ansiLexerTable
|
||||
> , (s1,t1) <- ansiLexerTable
|
||||
> , tokenListWillPrintAndLex ansi2011 $ t ++ t1
|
||||
|
||||
> ]
|
||||
> ,Group "adhoc lexer tests" $
|
||||
> map (uncurry $ LexerTest ansi2011)
|
||||
> ,Group "ansiadhoclexertests" $
|
||||
> map (uncurry $ LexTest ansi2011)
|
||||
> [("", [])
|
||||
> ,("-- line com\nstuff", [LineComment "-- line com\n",Identifier Nothing "stuff"])
|
||||
> ]
|
||||
> ] ++
|
||||
> [-- want to make sure this gives a parse error
|
||||
> LexFails ansi2011 "*/"
|
||||
> -- combinations of pipes: make sure they fail because they could be
|
||||
> -- ambiguous and it is really unclear when they are or not, and
|
||||
> -- what the result is even when they are not ambiguous
|
||||
> ,LexFails ansi2011 "|||"
|
||||
> ,LexFails ansi2011 "||||"
|
||||
> ,LexFails ansi2011 "|||||"
|
||||
> -- another user experience thing: make sure extra trailing
|
||||
> -- number chars are rejected rather than attempting to parse
|
||||
> -- if the user means to write something that is rejected by this code,
|
||||
> -- then they can use whitespace to make it clear and then it will parse
|
||||
> ,LexFails ansi2011 "12e3e4"
|
||||
> ,LexFails ansi2011 "12e3e4"
|
||||
> ,LexFails ansi2011 "12e3e4"
|
||||
> ,LexFails ansi2011 "12e3.4"
|
||||
> ,LexFails ansi2011 "12.4.5"
|
||||
> ,LexFails ansi2011 "12.4e5.6"
|
||||
> ,LexFails ansi2011 "12.4e5e7"]
|
||||
> ]
|
||||
|
||||
todo: lexing tests
|
||||
|
@ -187,7 +206,7 @@ also: do the testing for the ansi compatibility special cases
|
|||
> [ x
|
||||
> | n <- [1..l]
|
||||
> , x <- combos "+-*/<>=~!@#%^&|`?" n
|
||||
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
|
||||
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
|
||||
> , not (last x `elem` "+-")
|
||||
> || or (map (`elem` x) "~!@#%^&|`?")
|
||||
> ]
|
||||
|
@ -201,7 +220,7 @@ the + or -.
|
|||
> [ x
|
||||
> | n <- [1..l]
|
||||
> , x <- combos "+-*/<>=" n
|
||||
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
|
||||
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
|
||||
> , not (last x `elem` "+-")
|
||||
> ]
|
||||
|
||||
|
@ -209,27 +228,44 @@ the + or -.
|
|||
> postgresLexerTests :: TestItem
|
||||
> postgresLexerTests = Group "postgresLexerTests" $
|
||||
> [Group "postgres lexer token tests" $
|
||||
> [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
|
||||
> [LexTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
|
||||
> ,Group "postgres generated combination lexer tests" $
|
||||
> [ LexerTest postgres (s ++ s1) (t ++ t1)
|
||||
> [ LexTest postgres (s ++ s1) (t ++ t1)
|
||||
> | (s,t) <- postgresLexerTable
|
||||
> , (s1,t1) <- postgresLexerTable
|
||||
> , tokenListWillPrintAndLex postgres $ t ++ t1
|
||||
|
||||
> ]
|
||||
> ,Group "adhoc postgres lexer tests" $
|
||||
> [LexerTest postgres s t
|
||||
> ,Group "adhoc postgres lexertests" $
|
||||
> [LexTest postgres s t
|
||||
> | (s,t) <- edgeCaseCommentOps
|
||||
> ++ edgeCasePlusMinusOps
|
||||
> ++ edgeCasePlusMinusComments]
|
||||
> ++
|
||||
> -- need more tests for */ to make sure it is caught if it is in the middle of a
|
||||
> -- sequence of symbol letters
|
||||
> [LexFails postgres "*/"
|
||||
> ,LexFails postgres "@*/"
|
||||
> ,LexFails postgres "-*/"
|
||||
> ,LexFails postgres "12e3e4"
|
||||
> ,LexFails postgres "12e3e4"
|
||||
> ,LexFails postgres "12e3e4"
|
||||
> ,LexFails postgres "12e3.4"
|
||||
> ,LexFails postgres "12.4.5"
|
||||
> ,LexFails postgres "12.4e5.6"
|
||||
> ,LexFails postgres "12.4e5e7"]
|
||||
> ]
|
||||
> where
|
||||
> edgeCaseCommentOps = concat
|
||||
> [ [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
|
||||
> , (x ++ "--<test", [Symbol x, LineComment "--<test"]) ]
|
||||
> | x <- someValidPostgresOperators 2
|
||||
> edgeCaseCommentOps =
|
||||
> [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
|
||||
> | x <- eccops
|
||||
> , not (last x == '*')
|
||||
> ] ++
|
||||
> [ (x ++ "--<test", [Symbol x, LineComment "--<test"])
|
||||
> | x <- eccops
|
||||
> , not (last x == '-')
|
||||
> ]
|
||||
> eccops = someValidPostgresOperators 2
|
||||
> edgeCasePlusMinusOps = concat
|
||||
> [ [ (x ++ "+", [Symbol x, Symbol "+"])
|
||||
> , (x ++ "-", [Symbol x, Symbol "-"]) ]
|
||||
|
@ -244,22 +280,22 @@ the + or -.
|
|||
|
||||
|
||||
> sqlServerLexerTests :: TestItem
|
||||
> sqlServerLexerTests = Group "sqlServerLexerTests" $
|
||||
> [ LexerTest sqlserver s t | (s,t) <-
|
||||
> sqlServerLexerTests = Group "sqlServerLexTests" $
|
||||
> [ LexTest sqlserver s t | (s,t) <-
|
||||
> [--("@variable", [(Identifier (Just ("@", "")) "variable")])
|
||||
> --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")])
|
||||
> ]]
|
||||
|
||||
> oracleLexerTests :: TestItem
|
||||
> oracleLexerTests = Group "oracleLexerTests" $
|
||||
> [ LexerTest oracle s t | (s,t) <-
|
||||
> oracleLexerTests = Group "oracleLexTests" $
|
||||
> [ LexTest oracle s t | (s,t) <-
|
||||
> [--("#variable", [(Identifier (Just ("#", "")) "variable")])
|
||||
> ]
|
||||
> ]
|
||||
|
||||
> odbcLexerTests :: TestItem
|
||||
> odbcLexerTests = Group "odbcLexerTests" $
|
||||
> [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <-
|
||||
> odbcLexerTests = Group "odbcLexTests" $
|
||||
> [ LexTest sqlserver {- {odbc = True} -} s t | (s,t) <-
|
||||
> [--("{}", [Symbol "{", Symbol "}"])
|
||||
> ]
|
||||
> ]
|
||||
|
|
|
@ -30,5 +30,6 @@ check that the string given fails to parse
|
|||
|
||||
> | ParseQueryExprFails Dialect String
|
||||
> | ParseValueExprFails Dialect String
|
||||
> | LexerTest Dialect String [Token]
|
||||
> | LexTest Dialect String [Token]
|
||||
> | LexFails Dialect String
|
||||
> deriving (Eq,Show)
|
||||
|
|
|
@ -86,7 +86,8 @@ order on the generated documentation.
|
|||
> itemToTest (ParseValueExprFails d str) =
|
||||
> toFTest parseValueExpr prettyValueExpr d str
|
||||
|
||||
> itemToTest (LexerTest d s ts) = makeLexerTest d s ts
|
||||
> itemToTest (LexTest d s ts) = makeLexerTest d s ts
|
||||
> itemToTest (LexFails d s) = makeLexingFailsTest d s
|
||||
|
||||
> makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree
|
||||
> makeLexerTest d s ts = H.testCase s $ do
|
||||
|
@ -95,6 +96,13 @@ order on the generated documentation.
|
|||
> let s' = prettyTokens d $ map snd lx
|
||||
> H.assertEqual "pretty print" s s'
|
||||
|
||||
> makeLexingFailsTest :: Dialect -> String -> T.TestTree
|
||||
> makeLexingFailsTest d s = H.testCase s $ do
|
||||
> case lexSQL d "" Nothing s of
|
||||
> Right x -> H.assertFailure $ "lexing should have failed: " ++ s ++ "\ngot: " ++ show x
|
||||
> Left _ -> return ()
|
||||
|
||||
|
||||
> toTest :: (Eq a, Show a) =>
|
||||
> (Dialect -> String -> Maybe (Int,Int) -> String -> Either ParseError a)
|
||||
> -> (Dialect -> a -> String)
|
||||
|
|
Loading…
Reference in a new issue