From b4c2276a1f2b31aabd104aadf327381d400a73ea Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Mon, 15 Feb 2016 20:34:28 +0200 Subject: [PATCH] tidy up the tokensWillPrintAndLex function some more --- Language/SQL/SimpleSQL/Lex.lhs | 168 ++++++++++++++++----------------- 1 file changed, 79 insertions(+), 89 deletions(-) diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs index b4024d7..a8cf486 100644 --- a/Language/SQL/SimpleSQL/Lex.lhs +++ b/Language/SQL/SimpleSQL/Lex.lhs @@ -576,143 +576,136 @@ successes. > tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs) > tokensWillPrintAndLex :: Dialect -> Token -> Token -> Bool - -TODO: add more memoization, e.g. create a wrapper which pretty prints -both tokens so the pretty printed token can be reused in multiple -cases. +> tokensWillPrintAndLex d a b a : followed by an identifier character will look like a host param followed by = or : makes a different symbol -> tokensWillPrintAndLex d (Symbol ":") b -> | (b':_) <- prettyToken d b -> , isIdentifierChar b' || b' `elem` ":=" = False +> | Symbol ":" <- a +> , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False two symbols next to eachother will fail if the symbols can combine and -(possibly just the prefix) look like a different symbol, or if they -combine to look like comment markers +(possibly just the prefix) look like a different symbol -> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) (Symbol a) (Symbol x) -> | x `notElem` ["+", "-"] = False -> | or (map (`elem` a) "~!@#%^&|`?") = False +> | Dialect {diSyntaxFlavour = Postgres} <- d +> , Symbol a' <- a +> , Symbol b' <- b +> , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False -> tokensWillPrintAndLex _ (Symbol s1) (Symbol s2) -> | (s1,s2) `elem` -> [("<",">") -> ,("<","=") -> ,(">","=") -> ,("!","=") -> ,("|","|") -> ,("||","|") -> ,("|","||") -> ,("||","||") -> ,("<",">=") -> ] = False +check two adjacent symbols in non postgres where the combination +possibilities are much more limited. This is ansi behaviour, it might +be different when the other dialects are done properly -List explicitly all the cases which should fail +> | Symbol a' <- a +> , Symbol b' <- b +> , (a',b') `elem` [("<",">") +> ,("<","=") +> ,(">","=") +> ,("!","=") +> ,("|","|") +> ,("||","|") +> ,("|","||") +> ,("||","||") +> ,("<",">=") +> ] = False two whitespaces will be combined -> tokensWillPrintAndLex _ Whitespace {} Whitespace {} = False +> | Whitespace {} <- a +> , Whitespace {} <- b = False line comment without a newline at the end will eat the next token -> tokensWillPrintAndLex _ (LineComment s@(_:_)) _ = last s == '\n' +> | LineComment {} <- a +> , checkLastAChar (/='\n') = False -this should never happen, but the case satisfies the haskell compiler -and isn't exactly wrong +check the last character of the first token and the first character of +the second token forming a comment start or end symbol -> tokensWillPrintAndLex _ (LineComment []) _ = False - -a token which ends with - followed by another token which starts with -- will turn into a line comment - -> tokensWillPrintAndLex d a b -> | (a'@(_:_),('-':_)) <- (prettyToken d a, prettyToken d b) -> , last a' == '-' = False - -a token which ends with * followed by a / at the start of the next -token will cause a problem - -> tokensWillPrintAndLex d a b -> | (a'@(_:_),('/':_)) <- (prettyToken d a, prettyToken d b) -> , last a' == '*' = False - -The reverse is a problem also: ending with / then the next one -starting with * will create the start of a block comment - -todo: write a helper function for a predicate on the last char of the first token and the first char of the second token since this appears quite a few times - -> tokensWillPrintAndLex d a b -> | (a'@(_:_),('*':_)) <- (prettyToken d a, prettyToken d b) -> , last a' == '/' = False +> | let f '-' '-' = True +> f '/' '*' = True +> f '*' '/' = True +> f _ _ = False +> in checkBorderChars f = False a symbol will absorb a following . -TODO: not 100% on this +TODO: not 100% on this always being bad -> tokensWillPrintAndLex d Symbol {} b -> | ('.':_) <- prettyToken d b = False +> | Symbol {} <- a +> , checkFirstBChar (=='.') = False unquoted identifier followed by an identifier letter -> tokensWillPrintAndLex d (Identifier Nothing _) b -> | (b':_) <- prettyToken d b -> , isIdentifierChar b' = False +> | Identifier Nothing _ <- a +> , checkFirstBChar isIdentifierChar = False -two quoted identifiers with the same quote next to each other will -parse back as one identifier with the quote symbol in the middle +a quoted identifier using ", followed by a " will fail -> tokensWillPrintAndLex _ (Identifier (Just (_,[a])) _) (Identifier (Just ([b],_)) _) -> | a == b = False +> | Identifier (Just (_,"\"")) _ <- a +> , checkFirstBChar (=='"') = False host param followed by an identifier char will be absorbed -> tokensWillPrintAndLex d HostParam {} b -> | (b':_) <- prettyToken d b -> , isIdentifierChar b' = False +> | HostParam {} <- a +> , checkFirstBChar isIdentifierChar = False prefixed variable same: -> tokensWillPrintAndLex d PrefixedVariable {} b -> | (b':_) <- prettyToken d b -> , isIdentifierChar b' = False +> | PrefixedVariable {} <- a +> , checkFirstBChar isIdentifierChar = False a positional arg will absorb a following digit -> tokensWillPrintAndLex d PositionalArg {} b -> | (b':_) <- prettyToken d b -> , isDigit b' = False +> | PositionalArg {} <- a +> , checkFirstBChar isDigit = False a string ending with ' followed by a token starting with ' will be absorbed -> tokensWillPrintAndLex d (SqlString _q00 "'" _s0) b -> | ('\'':_) <- prettyToken d b = False +> | SqlString _ "'" _ <- a +> , checkFirstBChar (=='\'') = False a number followed by a . will fail or be absorbed -> tokensWillPrintAndLex d SqlNumber {} b -> | ('.':_) <- prettyToken d b = False +> | SqlNumber {} <- a +> , checkFirstBChar (=='.') = False a number followed by an e or E will fail or be absorbed -> tokensWillPrintAndLex d SqlNumber {} b -> | ('e':_) <- prettyToken d b = False -> | ('E':_) <- prettyToken d b = False +> | SqlNumber {} <- a +> , checkFirstBChar (\x -> x =='e' || x == 'E') = False two numbers next to eachother will fail or be absorbed -> tokensWillPrintAndLex _ SqlNumber {} SqlNumber {} = False +> | SqlNumber {} <- a +> , SqlNumber {} <- b = False +> +> | otherwise = True + + +> where +> prettya = prettyToken d a +> prettyb = prettyToken d b +> -- helper function to run a predicate on the +> -- last character of the first token and the first +> -- character of the second token +> checkBorderChars f +> | (_:_) <- prettya +> , (fb:_) <- prettyb +> , la <- last prettya +> = f la fb +> checkBorderChars _ = False +> checkFirstBChar f = case prettyb of +> (b':_) -> f b' +> _ -> False +> checkLastAChar f = case prettya of +> (_:_) -> f $ last prettya +> _ -> False + -> tokensWillPrintAndLex _ _ _ = True -todo: special case lexer so a second ., and . and e are not -allowed after exponent when there is no whitespace, even if there -is an unambiguous parse TODO: -refactor the tokenswillprintlex to be based on pretty printing the - individual tokens make the tokenswill print more dialect accurate. Maybe add symbol chars and identifier chars to the dialect definition and use them from here @@ -725,9 +718,6 @@ review existing tables look for refactoring opportunities, especially the token generation tables in the tests -add odbc as a dialect flag and include {} as symbols when enabled - - do some user documentation on lexing, and lexing/dialects start thinking about a more separated design for the dialect handling