From 9fd2970f26c9b530143af683d69ca63453c4b305 Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Mon, 15 Feb 2016 20:31:06 +0200 Subject: [PATCH] work on lexing: error cases fix so following a number with either . or e or E without whitespace will cause a lexing error remove */ from symbols in postgres (although postgres strictly speaking allows this I think, it is not a good idea) reject */ anywhere with an error reject ||| (or more pipes) in ansi, etc. dialects instead of trying to parse it as something like '||', '|' --- Language/SQL/SimpleSQL/Lex.lhs | 110 ++++++++++++-------- tools/Language/SQL/SimpleSQL/LexerTests.lhs | 78 ++++++++++---- tools/Language/SQL/SimpleSQL/TestTypes.lhs | 3 +- tools/Language/SQL/SimpleSQL/Tests.lhs | 10 +- 4 files changed, 136 insertions(+), 65 deletions(-) diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs index 302404b..fa4651d 100644 --- a/Language/SQL/SimpleSQL/Lex.lhs +++ b/Language/SQL/SimpleSQL/Lex.lhs @@ -176,6 +176,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but > ,blockComment d > ,sqlNumber d > ,positionalArg d +> ,dontParseEndBlockComment d > ,symbol d > ,sqlWhitespace d] @@ -312,15 +313,20 @@ considered part of the constant; it is an operator applied to the constant. > sqlNumber :: Dialect -> Parser Token -> sqlNumber _ = SqlNumber <$> -> (int (pp dot pp int) -> -- try is used in case we read a dot -> -- and it isn't part of a number -> -- if there are any following digits, then we commit -> -- to it being a number and not something else -> <|> try ((++) <$> dot <*> int)) -> pp expon +> sqlNumber _ = +> SqlNumber <$> completeNumber +> -- this is for definitely avoiding possibly ambiguous source +> <* notFollowedBy (oneOf "eE.") > where +> completeNumber = +> (int (pp dot pp int) +> -- try is used in case we read a dot +> -- and it isn't part of a number +> -- if there are any following digits, then we commit +> -- to it being a number and not something else +> <|> try ((++) <$> dot <*> int)) +> pp expon + > int = many1 digit > dot = string "." > expon = (:) <$> oneOf "eE" <*> sInt @@ -363,9 +369,6 @@ which allows the last character of a multi character symbol to be + or - > allOpSymbols = "+-*/<>=~!@#%^&|`?" -> -- all symbols except - and / which can be used to start -> -- a comment token -> allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols > -- these are the symbols when if part of a multi character > -- operator permit the operator to end with a + or - symbol > exceptionOpSymbols = "~!@#%^&|`?" @@ -380,11 +383,14 @@ which allows the last character of a multi character symbol to be + or > -- operator and we have alread seen one of the 'exception chars' > -- and so we can end with a + or - > moreOpCharsException = do -> c <- oneOf allOpSymbolsNoCommentStarters +> c <- oneOf (filter (`notElem` "-/*") allOpSymbols) > -- make sure we don't parse a comment starting token > -- as part of an operator > <|> try (char '/' <* notFollowedBy (char '*')) > <|> try (char '-' <* notFollowedBy (char '-')) +> -- and make sure we don't parse a block comment end +> -- as part of another symbol +> <|> try (char '*' <* notFollowedBy (char '/')) > (c:) <$> option [] moreOpCharsException > opMoreChars = choice @@ -402,8 +408,10 @@ which allows the last character of a multi character symbol to be + or > <* lookAhead (oneOf allOpSymbols)) > <|> -- parse / check it isn't the start of a /* comment > try (char '/' <* notFollowedBy (char '*')) +> <|> -- make sure we don't parse */ as part of a symbol +> try (char '*' <* notFollowedBy (char '/')) > <|> -- any other ansi operator symbol -> oneOf "*<>=") +> oneOf "<>=") > <*> option [] opMoreChars > ] @@ -416,9 +424,11 @@ which allows the last character of a multi character symbol to be + or try is used because most of the first characters of the two character symbols can also be part of a single character symbol -> regularOp = map (try . string) [">=","<=","!=","<>","||"] -> ++ map (string . (:[])) "+-^*/%~&|<>=" - +> regularOp = map (try . string) [">=","<=","!=","<>"] +> ++ map (string . (:[])) "+-^*/%~&<>=" +> ++ [char '|' *> +> choice ["||" <$ char '|' <* notFollowedBy (char '|') +> ,return "|"]] > symbol _ = > Symbol <$> choice (otherSymbol ++ regularOp) @@ -429,8 +439,11 @@ symbols can also be part of a single character symbol try is used because most of the first characters of the two character symbols can also be part of a single character symbol -> regularOp = map (try . string) [">=","<=","!=","<>","||"] -> ++ map (string . (:[])) "+-^*/%~&|<>=" +> regularOp = map (try . string) [">=","<=","!=","<>"] +> ++ map (string . (:[])) "+-^*/%~&<>=[]" +> ++ [char '|' *> +> choice ["||" <$ char '|' <* notFollowedBy (char '|') +> ,return "|"]] @@ -477,6 +490,18 @@ isn't there. > ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n] +This is to improve user experience: provide an error if we see */ +outside a comment. This could potentially break postgres ops with */ +in (which is a stupid thing to do). In other cases, the user should +write * / instead (I can't think of any cases when this would be valid +syntax though). + +> dontParseEndBlockComment :: Dialect -> Parser Token +> dontParseEndBlockComment _ = +> -- don't use try, then it should commit to the error +> try (string "*/") *> fail "comment end without comment start" + + Some helper combinators > startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String @@ -513,6 +538,10 @@ maybe do some quick checking to make sure this function only gives true negatives: check pairs which return false actually fail to lex or give different symbols in return +a good sanity test for this function is to change it to always return +true, then check that the automated tests return the same number of +successes. + > tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool > tokenListWillPrintAndLex _ [] = True > tokenListWillPrintAndLex _ [_] = True @@ -535,7 +564,8 @@ two symbols next to eachother will fail if the symbols can combine and (possibly just the prefix) look like a different symbol, or if they combine to look like comment markers - +check if the end of one symbol and the start of the next can form a +comment token > tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {}) > | a'@(_:_) <- prettyToken d a @@ -553,17 +583,13 @@ combine to look like comment markers > ,(">","=") > ,("!","=") > ,("|","|") -> -- todo: make the lexer special case reject ||| and so on -> -- because it is ambiguous -> --,("||","|") +> ,("||","|") > ,("|","||") -> --,("||","||") +> ,("||","||") > ,("<",">=") > ,("-","-") > ,("/","*") -> -- todo: special case make the lexer reject */ next to eachother -> -- because trying to interpret it as *,/ will lead to much worse error messages -> --,("*","/") +> ,("*","/") > ] two whitespaces will be combined @@ -593,11 +619,17 @@ some of the symbol > (_:_) -> last s /= '-' > _ -> True -in other situations a trailing line comment or a leading or trailing -block comment will work +in other situations a trailing line comment will work > tokensWillPrintAndLex _ _ LineComment {} = True -> tokensWillPrintAndLex _ _ BlockComment {} = True + +block comments: make sure there isn't a * symbol immediately before the comment opening + +> tokensWillPrintAndLex d a BlockComment {} = +> case prettyToken d a of +> a'@(_:_) | last a' == '*' -> False +> _ -> True + > tokensWillPrintAndLex _ BlockComment {} _ = True @@ -708,6 +740,11 @@ is an unambiguous parse TODO: +refactor the tokenswillprintlex to be based on pretty printing the +individual tokens + +start adding negative / different parse dialect tests + lex @variable in sql server lex [quoted identifier] in sql server lex #variable in oracle @@ -715,22 +752,11 @@ lex #variable in oracle make a new ctor for @var, #var add token tables and tests for oracle, sql server +review existing tables +look for refactoring opportunities add odbc as a dialect flag and include {} as symbols when enabled -add negative / different parse dialect tests - -refactor the tokenswillprintlex to be based on pretty printing the -individual tokens - -add special cases to lexer: -||| three or more pipes is a syntax error (except for postgres) - (and not ||,| or |,|| e.g.) -extra ., e after number without whitespace is a syntax error - -reject a */ pair of chars instead of parsing it as a symbol or several -symbols. Can this appear inside an operator in postgres? Let's make it -not possible anywhere since this could be really confusing. do some user documentation on lexing, and lexing/dialects diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs index 3737091..269d4ec 100644 --- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs +++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs @@ -68,19 +68,38 @@ Test for the lexer > ansiLexerTests :: TestItem > ansiLexerTests = Group "ansiLexerTests" $ -> [Group "ansi lexer token tests" $ [LexerTest ansi2011 s t | (s,t) <- ansiLexerTable] +> [Group "ansi lexer token tests" $ [LexTest ansi2011 s t | (s,t) <- ansiLexerTable] > ,Group "ansi generated combination lexer tests" $ -> [ LexerTest ansi2011 (s ++ s1) (t ++ t1) +> [ LexTest ansi2011 (s ++ s1) (t ++ t1) > | (s,t) <- ansiLexerTable > , (s1,t1) <- ansiLexerTable > , tokenListWillPrintAndLex ansi2011 $ t ++ t1 > ] -> ,Group "adhoc lexer tests" $ -> map (uncurry $ LexerTest ansi2011) +> ,Group "ansiadhoclexertests" $ +> map (uncurry $ LexTest ansi2011) > [("", []) > ,("-- line com\nstuff", [LineComment "-- line com\n",Identifier Nothing "stuff"]) -> ] +> ] ++ +> [-- want to make sure this gives a parse error +> LexFails ansi2011 "*/" +> -- combinations of pipes: make sure they fail because they could be +> -- ambiguous and it is really unclear when they are or not, and +> -- what the result is even when they are not ambiguous +> ,LexFails ansi2011 "|||" +> ,LexFails ansi2011 "||||" +> ,LexFails ansi2011 "|||||" +> -- another user experience thing: make sure extra trailing +> -- number chars are rejected rather than attempting to parse +> -- if the user means to write something that is rejected by this code, +> -- then they can use whitespace to make it clear and then it will parse +> ,LexFails ansi2011 "12e3e4" +> ,LexFails ansi2011 "12e3e4" +> ,LexFails ansi2011 "12e3e4" +> ,LexFails ansi2011 "12e3.4" +> ,LexFails ansi2011 "12.4.5" +> ,LexFails ansi2011 "12.4e5.6" +> ,LexFails ansi2011 "12.4e5e7"] > ] todo: lexing tests @@ -187,7 +206,7 @@ also: do the testing for the ansi compatibility special cases > [ x > | n <- [1..l] > , x <- combos "+-*/<>=~!@#%^&|`?" n -> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x) +> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x) > , not (last x `elem` "+-") > || or (map (`elem` x) "~!@#%^&|`?") > ] @@ -201,7 +220,7 @@ the + or -. > [ x > | n <- [1..l] > , x <- combos "+-*/<>=" n -> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x) +> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x) > , not (last x `elem` "+-") > ] @@ -209,27 +228,44 @@ the + or -. > postgresLexerTests :: TestItem > postgresLexerTests = Group "postgresLexerTests" $ > [Group "postgres lexer token tests" $ -> [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable] +> [LexTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable] > ,Group "postgres generated combination lexer tests" $ -> [ LexerTest postgres (s ++ s1) (t ++ t1) +> [ LexTest postgres (s ++ s1) (t ++ t1) > | (s,t) <- postgresLexerTable > , (s1,t1) <- postgresLexerTable > , tokenListWillPrintAndLex postgres $ t ++ t1 > ] -> ,Group "adhoc postgres lexer tests" $ -> [LexerTest postgres s t +> ,Group "adhoc postgres lexertests" $ +> [LexTest postgres s t > | (s,t) <- edgeCaseCommentOps > ++ edgeCasePlusMinusOps > ++ edgeCasePlusMinusComments] +> ++ +> -- need more tests for */ to make sure it is caught if it is in the middle of a +> -- sequence of symbol letters +> [LexFails postgres "*/" +> ,LexFails postgres "@*/" +> ,LexFails postgres "-*/" +> ,LexFails postgres "12e3e4" +> ,LexFails postgres "12e3e4" +> ,LexFails postgres "12e3e4" +> ,LexFails postgres "12e3.4" +> ,LexFails postgres "12.4.5" +> ,LexFails postgres "12.4e5.6" +> ,LexFails postgres "12.4e5e7"] > ] > where -> edgeCaseCommentOps = concat -> [ [ (x ++ "/* , (x ++ "-- | x <- someValidPostgresOperators 2 +> edgeCaseCommentOps = +> [ (x ++ "/* | x <- eccops +> , not (last x == '*') +> ] ++ +> [ (x ++ "-- | x <- eccops > , not (last x == '-') > ] +> eccops = someValidPostgresOperators 2 > edgeCasePlusMinusOps = concat > [ [ (x ++ "+", [Symbol x, Symbol "+"]) > , (x ++ "-", [Symbol x, Symbol "-"]) ] @@ -244,22 +280,22 @@ the + or -. > sqlServerLexerTests :: TestItem -> sqlServerLexerTests = Group "sqlServerLexerTests" $ -> [ LexerTest sqlserver s t | (s,t) <- +> sqlServerLexerTests = Group "sqlServerLexTests" $ +> [ LexTest sqlserver s t | (s,t) <- > [--("@variable", [(Identifier (Just ("@", "")) "variable")]) > --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")]) > ]] > oracleLexerTests :: TestItem -> oracleLexerTests = Group "oracleLexerTests" $ -> [ LexerTest oracle s t | (s,t) <- +> oracleLexerTests = Group "oracleLexTests" $ +> [ LexTest oracle s t | (s,t) <- > [--("#variable", [(Identifier (Just ("#", "")) "variable")]) > ] > ] > odbcLexerTests :: TestItem -> odbcLexerTests = Group "odbcLexerTests" $ -> [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <- +> odbcLexerTests = Group "odbcLexTests" $ +> [ LexTest sqlserver {- {odbc = True} -} s t | (s,t) <- > [--("{}", [Symbol "{", Symbol "}"]) > ] > ] diff --git a/tools/Language/SQL/SimpleSQL/TestTypes.lhs b/tools/Language/SQL/SimpleSQL/TestTypes.lhs index 0022a14..b9cb4bc 100644 --- a/tools/Language/SQL/SimpleSQL/TestTypes.lhs +++ b/tools/Language/SQL/SimpleSQL/TestTypes.lhs @@ -30,5 +30,6 @@ check that the string given fails to parse > | ParseQueryExprFails Dialect String > | ParseValueExprFails Dialect String -> | LexerTest Dialect String [Token] +> | LexTest Dialect String [Token] +> | LexFails Dialect String > deriving (Eq,Show) diff --git a/tools/Language/SQL/SimpleSQL/Tests.lhs b/tools/Language/SQL/SimpleSQL/Tests.lhs index ec32437..2ab843e 100644 --- a/tools/Language/SQL/SimpleSQL/Tests.lhs +++ b/tools/Language/SQL/SimpleSQL/Tests.lhs @@ -86,7 +86,8 @@ order on the generated documentation. > itemToTest (ParseValueExprFails d str) = > toFTest parseValueExpr prettyValueExpr d str -> itemToTest (LexerTest d s ts) = makeLexerTest d s ts +> itemToTest (LexTest d s ts) = makeLexerTest d s ts +> itemToTest (LexFails d s) = makeLexingFailsTest d s > makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree > makeLexerTest d s ts = H.testCase s $ do @@ -95,6 +96,13 @@ order on the generated documentation. > let s' = prettyTokens d $ map snd lx > H.assertEqual "pretty print" s s' +> makeLexingFailsTest :: Dialect -> String -> T.TestTree +> makeLexingFailsTest d s = H.testCase s $ do +> case lexSQL d "" Nothing s of +> Right x -> H.assertFailure $ "lexing should have failed: " ++ s ++ "\ngot: " ++ show x +> Left _ -> return () + + > toTest :: (Eq a, Show a) => > (Dialect -> String -> Maybe (Int,Int) -> String -> Either ParseError a) > -> (Dialect -> a -> String)