
tidy up the tokensWillPrintAndLex function some more

Jake Wheat 2016-02-15 20:34:28 +02:00
parent 36109ce584
commit b4c2276a1f


@@ -576,143 +576,136 @@ successes.

 > tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)

 > tokensWillPrintAndLex :: Dialect -> Token -> Token -> Bool
+> tokensWillPrintAndLex d a b

-TODO: add more memoization, e.g. create a wrapper which pretty prints
-both tokens so the pretty printed token can be reused in multiple
-cases.
-
 a : followed by an identifier character will look like a host param
 followed by = or : makes a different symbol

-> tokensWillPrintAndLex d (Symbol ":") b
->     | (b':_) <- prettyToken d b
->     , isIdentifierChar b' || b' `elem` ":=" = False
+>     | Symbol ":" <- a
+>     , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False

 two symbols next to eachother will fail if the symbols can combine and
-(possibly just the prefix) look like a different symbol, or if they
-combine to look like comment markers
+(possibly just the prefix) look like a different symbol

-> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) (Symbol a) (Symbol x)
->     | x `notElem` ["+", "-"] = False
->     | or (map (`elem` a) "~!@#%^&|`?") = False
+>     | Dialect {diSyntaxFlavour = Postgres} <- d
+>     , Symbol a' <- a
+>     , Symbol b' <- b
+>     , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False

-List explicitly all the cases which should fail
+check two adjacent symbols in non postgres where the combination
+possibilities are much more limited. This is ansi behaviour, it might
+be different when the other dialects are done properly

-> tokensWillPrintAndLex _ (Symbol s1) (Symbol s2)
->     | (s1,s2) `elem`
->       [("<",">")
->       ,("<","=")
->       ,(">","=")
->       ,("!","=")
->       ,("|","|")
->       ,("||","|")
->       ,("|","||")
->       ,("||","||")
->       ,("<",">=")
->       ] = False
+>     | Symbol a' <- a
+>     , Symbol b' <- b
+>     , (a',b') `elem` [("<",">")
+>                      ,("<","=")
+>                      ,(">","=")
+>                      ,("!","=")
+>                      ,("|","|")
+>                      ,("||","|")
+>                      ,("|","||")
+>                      ,("||","||")
+>                      ,("<",">=")
+>                      ] = False

 two whitespaces will be combined

-> tokensWillPrintAndLex _ Whitespace {} Whitespace {} = False
+>     | Whitespace {} <- a
+>     , Whitespace {} <- b = False

 line comment without a newline at the end will eat the next token

-> tokensWillPrintAndLex _ (LineComment s@(_:_)) _ = last s == '\n'
+>     | LineComment {} <- a
+>     , checkLastAChar (/='\n') = False

-this should never happen, but the case satisfies the haskell compiler
-and isn't exactly wrong
-
-> tokensWillPrintAndLex _ (LineComment []) _ = False
-
-a token which ends with - followed by another token which starts with
-- will turn into a line comment
-
-> tokensWillPrintAndLex d a b
->     | (a'@(_:_),('-':_)) <- (prettyToken d a, prettyToken d b)
->     , last a' == '-' = False
-
-a token which ends with * followed by a / at the start of the next
-token will cause a problem
-
-> tokensWillPrintAndLex d a b
->     | (a'@(_:_),('/':_)) <- (prettyToken d a, prettyToken d b)
->     , last a' == '*' = False
-
-The reverse is a problem also: ending with / then the next one
-starting with * will create the start of a block comment
-
-todo: write a helper function for a predicate on the last char of the
-first token and the first char of the second token since this appears
-quite a few times
-
-> tokensWillPrintAndLex d a b
->     | (a'@(_:_),('*':_)) <- (prettyToken d a, prettyToken d b)
->     , last a' == '/' = False
+check the last character of the first token and the first character of
+the second token forming a comment start or end symbol
+
+>     | let f '-' '-' = True
+>           f '/' '*' = True
+>           f '*' '/' = True
+>           f _ _ = False
+>       in checkBorderChars f = False

 a symbol will absorb a following .
-TODO: not 100% on this
+TODO: not 100% on this always being bad

-> tokensWillPrintAndLex d Symbol {} b
->     | ('.':_) <- prettyToken d b = False
+>     | Symbol {} <- a
+>     , checkFirstBChar (=='.') = False

 unquoted identifier followed by an identifier letter

-> tokensWillPrintAndLex d (Identifier Nothing _) b
->     | (b':_) <- prettyToken d b
->     , isIdentifierChar b' = False
+>     | Identifier Nothing _ <- a
+>     , checkFirstBChar isIdentifierChar = False

-two quoted identifiers with the same quote next to each other will
-parse back as one identifier with the quote symbol in the middle
+a quoted identifier using ", followed by a " will fail

-> tokensWillPrintAndLex _ (Identifier (Just (_,[a])) _) (Identifier (Just ([b],_)) _)
->     | a == b = False
+>     | Identifier (Just (_,"\"")) _ <- a
+>     , checkFirstBChar (=='"') = False

 host param followed by an identifier char will be absorbed

-> tokensWillPrintAndLex d HostParam {} b
->     | (b':_) <- prettyToken d b
->     , isIdentifierChar b' = False
+>     | HostParam {} <- a
+>     , checkFirstBChar isIdentifierChar = False

 prefixed variable same:

-> tokensWillPrintAndLex d PrefixedVariable {} b
->     | (b':_) <- prettyToken d b
->     , isIdentifierChar b' = False
+>     | PrefixedVariable {} <- a
+>     , checkFirstBChar isIdentifierChar = False

 a positional arg will absorb a following digit

-> tokensWillPrintAndLex d PositionalArg {} b
->     | (b':_) <- prettyToken d b
->     , isDigit b' = False
+>     | PositionalArg {} <- a
+>     , checkFirstBChar isDigit = False

 a string ending with ' followed by a token starting with ' will be absorbed

-> tokensWillPrintAndLex d (SqlString _q00 "'" _s0) b
->     | ('\'':_) <- prettyToken d b = False
+>     | SqlString _ "'" _ <- a
+>     , checkFirstBChar (=='\'') = False

 a number followed by a . will fail or be absorbed

-> tokensWillPrintAndLex d SqlNumber {} b
->     | ('.':_) <- prettyToken d b = False
+>     | SqlNumber {} <- a
+>     , checkFirstBChar (=='.') = False

 a number followed by an e or E will fail or be absorbed

-> tokensWillPrintAndLex d SqlNumber {} b
->     | ('e':_) <- prettyToken d b = False
->     | ('E':_) <- prettyToken d b = False
+>     | SqlNumber {} <- a
+>     , checkFirstBChar (\x -> x =='e' || x == 'E') = False

 two numbers next to eachother will fail or be absorbed

-> tokensWillPrintAndLex _ SqlNumber {} SqlNumber {} = False
-
-> tokensWillPrintAndLex _ _ _ = True
+>     | SqlNumber {} <- a
+>     , SqlNumber {} <- b = False
+>
+>     | otherwise = True
+>   where
+>     prettya = prettyToken d a
+>     prettyb = prettyToken d b
+>     -- helper function to run a predicate on the
+>     -- last character of the first token and the first
+>     -- character of the second token
+>     checkBorderChars f
+>         | (_:_) <- prettya
+>         , (fb:_) <- prettyb
+>         , la <- last prettya
+>         = f la fb
+>     checkBorderChars _ = False
+>     checkFirstBChar f = case prettyb of
+>         (b':_) -> f b'
+>         _ -> False
+>     checkLastAChar f = case prettya of
+>         (_:_) -> f $ last prettya
+>         _ -> False

-todo: special case lexer so a second ., and . and e are not
-allowed after exponent when there is no whitespace, even if there
-is an unambiguous parse

 TODO:

-refactor the tokenswillprintlex to be based on pretty printing the
-individual tokens

 make the tokenswill print more dialect accurate. Maybe add symbol
 chars and identifier chars to the dialect definition and use them from
 here
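[Editorial note: the adjacency checks in this commit all funnel through a small set of helpers (`checkBorderChars`, `checkFirstBChar`, `checkLastAChar`) over the pretty printed token text. A minimal standalone sketch of that pattern, with plain strings standing in for `prettyToken` output (an illustration, not the library code):]

```haskell
-- Sketch of the border-check helper pattern from the diff above.
-- The two String arguments stand in for the pretty printed forms
-- of two adjacent tokens.

-- run a predicate on the last char of the first token and the
-- first char of the second token; False if either is empty
checkBorderChars :: String -> String -> (Char -> Char -> Bool) -> Bool
checkBorderChars prettya prettyb f
    | (_:_) <- prettya
    , (fb:_) <- prettyb = f (last prettya) fb
checkBorderChars _ _ _ = False

-- would the border of the two tokens form a comment start or end?
commentBorder :: Char -> Char -> Bool
commentBorder '-' '-' = True
commentBorder '/' '*' = True
commentBorder '*' '/' = True
commentBorder _ _ = False

main :: IO ()
main = do
    -- "x-" next to "-y" would lex back as "x" then a line comment
    print (checkBorderChars "x-" "-y" commentBorder)
    print (checkBorderChars "x" "y" commentBorder)
```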
@@ -725,9 +718,6 @@ review existing tables

 look for refactoring opportunities, especially the token
 generation tables in the tests

-add odbc as a dialect flag and include {} as symbols when enabled

 do some user documentation on lexing, and lexing/dialects

 start thinking about a more separated design for the dialect handling
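[Editorial note: the TODO above suggests moving symbol and identifier characters into the dialect definition. One hypothetical shape for that, where every name is invented for illustration and not part of the actual `Dialect` type:]

```haskell
-- Invented for illustration: carrying lexing character classes in the
-- dialect definition, per the TODO above. Not the library's real types.
data DialectChars = DialectChars
    { dcSymbolChars :: [Char]      -- chars that can extend a multi-char symbol
    , dcIdentifierChars :: [Char]  -- chars that can extend an identifier
    }

-- rough ANSI-ish defaults, as a placeholder only
ansiChars :: DialectChars
ansiChars = DialectChars
    { dcSymbolChars = "<>=!|"
    , dcIdentifierChars = ['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9'] ++ "_"
    }

-- the hard-coded isIdentifierChar in the guards could then become
-- a dialect-aware lookup
isIdentifierCharIn :: DialectChars -> Char -> Bool
isIdentifierCharIn dc c = c `elem` dcIdentifierChars dc

main :: IO ()
main = print (map (isIdentifierCharIn ansiChars) "a1_<")
```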