work on lexing

add utility function to tell if two tokens will pretty print then lex
back to the same two tokens or not
add notes for some final missing lexing bits that are in hssqlppp
add token combo tests for postgres
add start of sql server, oracle and odbc lexing tests

parent 47198c78c1
commit 4bca2fa2ec
@@ -44,7 +44,10 @@ todo: public documentation on dialect definition - and dialect flags

>     ,prettyToken
>     ,prettyTokens
>     ,ParseError(..)
>     ,Dialect(..)) where
>     ,Dialect(..)
>     ,tokensWillPrintAndLex
>     ,tokenListWillPrintAndLex
>     ) where

> import Language.SQL.SimpleSQL.Dialect
@@ -68,7 +71,7 @@ todo: public documentation on dialect definition - and dialect flags

> -- | Represents a lexed token
> data Token
> -- | A symbol is one of the following
> -- | A symbol (in ansi dialect) is one of the following
> --
> -- * multi char symbols <> <= >= != ||
> -- * single char symbols: * + - < > ^ / % ~ & | ? ( ) [ ] , ;
@@ -404,6 +407,18 @@ which allows the last character of a multi character symbol to be + or

>     <*> option [] opMoreChars
>     ]

> symbol d | diSyntaxFlavour d == SQLServer =
>     Symbol <$> choice (otherSymbol ++ regularOp)
>   where
>     otherSymbol = many1 (char '.') :
>                   map (string . (:[])) ",;():?"

try is used because most of the first characters of the two character
symbols can also be part of a single character symbol

>     regularOp = map (try . string) [">=","<=","!=","<>","||"]
>                 ++ map (string . (:[])) "+-^*/%~&|<>="

> symbol _ =
>     Symbol <$> choice (otherSymbol ++ regularOp)
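to illustrate the sql server symbol parser above (behaviour read off the
parsers, not test output from this commit):

    ">="   lexes as Symbol ">="    (regularOp, two char symbols tried first via try)
    "..."  lexes as Symbol "..."   (otherSymbol, many1 (char '.'))
    "!"    on its own is not a symbol here: '!' only occurs in the two char "!="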
@@ -477,190 +492,246 @@ Some helper combinators

> takeWhile p = many (satisfy p)

> takeTill :: (Char -> Bool) -> Parser String
> takeTill p =
>     manyTill anyChar (peekSatisfy p)
> takeTill p = manyTill anyChar (peekSatisfy p)

> peekSatisfy :: (Char -> Bool) -> Parser ()
> peekSatisfy p = do
>     void $ lookAhead (satisfy p)
> peekSatisfy p = void $ lookAhead (satisfy p)
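a note on the new one-liners (a sketch of the behaviour, not part of the
commit): peekSatisfy succeeds without consuming any input, since it uses
lookAhead, so takeTill stops just before the first matching character:

    takeTill (=='\n') applied to "ab\ncd"   -- consumes "ab", leaves "\ncd" unread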
This utility function accurately reports whether two tokens, when pretty
printed next to each other, will lex back to the same two tokens. It is used
in testing (and can be used in other places), and must not be implemented by
actually printing and then lexing (because then we would have the risk of
thinking two tokens cannot be together when there is a bug in the lexer and
it should be possible to put them together).

question: maybe pretty printing the tokens separately and then analysing the
concrete syntax without concatenating the two printed tokens together is a
better way of doing this?

maybe do some quick checking to make sure this function only gives true
negatives: check that pairs which return false actually fail to lex, or lex
back to different tokens.
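for example, with the functions defined below (an illustrative ghci session,
not part of the commit; ansi2011 is the dialect value used in the test
modules):

    ghci> tokensWillPrintAndLex ansi2011 (Symbol "<") (Symbol "=")
    False   -- "<" ++ "=" would lex back as the single symbol "<="
    ghci> tokensWillPrintAndLex ansi2011 (Symbol "<") (Whitespace " ")
    True
    ghci> tokenListWillPrintAndLex ansi2011 [Symbol "-", Symbol "-"]
    False   -- "--" would lex back as the start of a line comment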
> tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
> tokenListWillPrintAndLex _ [] = True
> tokenListWillPrintAndLex _ [_] = True
> tokenListWillPrintAndLex d (a:b:xs) =
>     tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)

> tokensWillPrintAndLex :: Dialect -> Token -> Token -> Bool
> tokensWillPrintAndLex d (Symbol ":") x =
>     case prettyToken d x of
>       -- eliminate cases:
>       -- first letter of pretty x can be start of identifier
>       -- this will look like a hostparam
>       -- first letter of x is :, this will look like ::
>       -- first letter of x is =, this will look like :=
>       (a:_) | a `elem` ":_=" || isAlpha a -> False
>       _ -> True
two symbols next to each other will fail if the symbols can combine and
(possibly just the prefix) look like a different symbol, or if they combine
to look like comment markers

postgresql notes:
u&

a string continued onto the next line:
SELECT 'foo'
'bar';
is equivalent to:
SELECT 'foobar';

but on one line:
SELECT 'foo' 'bar';
is invalid
> tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {})
>     | a'@(_:_) <- prettyToken d a
>     , ('-':_) <- prettyToken d b
>     , last a' == '-' = False

(this should be in ansi also)

> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) (Symbol a) (Symbol x) =
>     (x `elem` ["+", "-"])
>     && and (map (`notElem` a) "~!@#%^&|`?")
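(the postgres rule being relied on: a multi character operator can only end
in '+' or '-' if it also contains at least one of ~ ! @ # % ^ & | ` ? -
illustratively:

    Symbol "*" then Symbol "+"  ->  "*+"  lexes back as "*", "+"          (True)
    Symbol "@" then Symbol "-"  ->  "@-"  lexes back as a single operator (False))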
definitely do major review and docs:
when can escapes and prefixes be used with syntactic string literals
when can they be combined
when can e.g. dollar quoting be used
what escaping should there be, including unicode escapes
> tokensWillPrintAndLex _ (Symbol s1) (Symbol s2) =
>     (s1,s2) `notElem`
>     [("<",">")
>     ,("<","=")
>     ,(">","=")
>     ,("!","=")
>     ,("|","|")
>     -- todo: make the lexer special case reject ||| and so on
>     -- because it is ambiguous
>     --,("||","|")
>     ,("|","||")
>     --,("||","||")
>     ,("<",">=")
>     ,("-","-")
>     ,("/","*")
>     -- todo: special case: make the lexer reject */ next to each other
>     -- because trying to interpret it as *,/ will lead to much worse error messages
>     --,("*","/")
>     ]
two whitespace tokens will be combined

> tokensWillPrintAndLex _ Whitespace {} Whitespace {} = False

a line comment without a newline at the end will eat the next token

> tokensWillPrintAndLex _ (LineComment s@(_:_)) _ = last s == '\n'
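for instance (illustrative, assuming a LineComment pretty prints as its
stored text):

    LineComment "--hi" then Symbol "+"    ->  "--hi+"    one comment: bad
    LineComment "--hi\n" then Symbol "+"  ->  "--hi\n+"  comment then symbol: ok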
this should never happen, but the case satisfies the haskell compiler and
isn't exactly wrong

> tokensWillPrintAndLex _ (LineComment []) _ = False

apart from the two cases above, leading and trailing whitespace will always be ok

> tokensWillPrintAndLex _ Whitespace {} _ = True
> tokensWillPrintAndLex _ _ Whitespace {} = True
a symbol ending with a '-' followed by a line comment will lex back
differently, since the --- will combine and the comment will eat some of the
symbol

> tokensWillPrintAndLex _ (Symbol s) (LineComment {}) =
>     case s of
>       (_:_) -> last s /= '-'
>       _ -> True

in other situations a trailing line comment or a leading or trailing block
comment will work

> tokensWillPrintAndLex _ _ LineComment {} = True
> tokensWillPrintAndLex _ _ BlockComment {} = True
> tokensWillPrintAndLex _ BlockComment {} _ = True
E'string'

with a range of escapes which should appear in the dialect data type

> tokensWillPrintAndLex _ Symbol {} Identifier {} = True

dollar quoted strings
never with prefixes/escapes

B''
X''

numbers

:: cast

> tokensWillPrintAndLex _ Symbol {} HostParam {} = True
> tokensWillPrintAndLex _ Symbol {} PositionalArg {} = True
> tokensWillPrintAndLex _ Symbol {} SqlString {} = True
> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) Symbol {} (SqlNumber ('.':_)) = False
> tokensWillPrintAndLex _ Symbol {} SqlNumber {} = True
type 'string' - literals only, not array types
ansi allows for some specific types
'string'::type
cast('string' as type)

identifier:
  symbol: ok
  identifier:
    alphas then alphas: bad
    quote then quote (with same start and end quote): bad
    quote [ ] then quote [ ]: ok? this technically works, not sure if it is
      a good ui, or if requiring whitespace/comment is better. See what sql
      server does
    second is quote with prefix: makes it ok
  host param: ok, but maybe should require whitespace for ui reasons
  positional arg: ok, but maybe should require whitespace for ui reasons
  string: ok, but maybe should require whitespace for ui reasons
  number: ok, but maybe should require whitespace for ui reasons

can use dollar quoting here
typename('string') (not all types)
check these in the parser for keyword issues
> tokensWillPrintAndLex _ Identifier {} Symbol {} = True
> tokensWillPrintAndLex _ (Identifier Nothing _) (Identifier Nothing _) = False
> tokensWillPrintAndLex _ (Identifier Nothing _) (Identifier (Just (a,_)) _) =
>     case a of
>       (a':_) | isAlpha a' -> False
>       _ -> True
> tokensWillPrintAndLex _ (Identifier Just {} _) (Identifier Nothing _) = True
> tokensWillPrintAndLex _ (Identifier (Just (_,b)) _) (Identifier (Just (c,_)) _) =
>     b /= c
> tokensWillPrintAndLex _ Identifier {} HostParam {} = True
> tokensWillPrintAndLex _ Identifier {} PositionalArg {} = True
> tokensWillPrintAndLex _ (Identifier Nothing _) (SqlString a _ _) =
>     case a of
>       (a':_) | isAlpha a' -> False
>       _ -> True
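for example (illustrative):

    Identifier Nothing "a" then Identifier Nothing "b"
        ->  ab       lexes back as one identifier, so False
    Identifier (Just ("\"","\"")) "a" then Identifier (Just ("\"","\"")) "b"
        ->  "a""b"   lexes back as one quoted identifier with a quote escape, so False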
extended operator rules

$1 positional parameter
()
[]
,
;
: array slices and variable names/hostparam
*
.

some operator precedence notes

SELECT 3 OPERATOR(pg_catalog.+) 4;

diff from ansi:

all the same symbols + more + different rules about parsing multi char
symbols (ansi is trivial here, postgresql is not trivial and extensible)
identifiers: same, doublecheck the u&
hostparam: same, but with implementation issues because : is also a symbol
in postgresql. this might be a little tricky to deal with

string literals:
u&?, does pg support n?

numbers: same
whitespace, comments: same

make sure there is a list of lexical syntax which is valid in postgres and
not in ansi, and vice versa, and have explicit tests for these. There might
also be situations here where a string is valid in both, but lexes
differently. There are definitely cases like this in the main syntax.
> tokensWillPrintAndLex _ Identifier {} SqlString {} = True
> tokensWillPrintAndLex _ (Identifier Nothing _) (SqlNumber s) =
>     case s of
>       (s':_) -> not (isDigit s')
>       _ -> True
> tokensWillPrintAndLex _ Identifier {} SqlNumber {} = True
action plan:
no abstract syntax changes are needed
write down a spec for ansi and for postgresql lexical syntax
create a list of tests for postgresql
include everything from ansi which is the same: maybe refactor the tests to
make this maintainable

design for escaping issues
(affects ansi also)
design for string literal-like syntax and for continuation strings
(affects ansi also)
> tokensWillPrintAndLex _ HostParam {} Symbol {} = True
> tokensWillPrintAndLex _ HostParam {} (Identifier Nothing _) = False
> tokensWillPrintAndLex _ HostParam {} (Identifier (Just (a,_)) _) =
>     case a of
>       c:_ -> not (isAlpha c)
>       [] -> False
the test approach in general is first to parse basic examples of each kind
of token, then to manually come up with some edge cases to test, and then to
generate a good representative set of tokens (probably the same set as the
previous two categories), create the cross product of pairs of these tokens,
eliminate the pairs where the two tokens next to each other don't parse as
the two separate tokens, using manually written rules (we want to be super
accurate here - no false positives or negatives), then test that all of
these parse correctly as well. Separating out the lexing in this way, and
taking this approach, I think gives a very good chance of minimising bugs in
the basic parsing, especially in the hairy bits.
> tokensWillPrintAndLex _ HostParam {} HostParam {} = True
> tokensWillPrintAndLex _ HostParam {} PositionalArg {} = True
> tokensWillPrintAndLex _ HostParam {} (SqlString a _ _) =
>     case a of
>       (a':_) | isAlpha a' -> False
>       _ -> True
> tokensWillPrintAndLex _ HostParam {} (SqlNumber s) =
>     case s of
>       (s':_) -> not (isDigit s')
>       _ -> True

> tokensWillPrintAndLex _ PositionalArg {} Symbol {} = True
> tokensWillPrintAndLex _ PositionalArg {} Identifier {} = True
> tokensWillPrintAndLex _ PositionalArg {} HostParam {} = True
> tokensWillPrintAndLex _ PositionalArg {} PositionalArg {} = True
> tokensWillPrintAndLex _ PositionalArg {} SqlString {} = True -- todo: think carefully about dollar quoting?
> tokensWillPrintAndLex _ PositionalArg {} (SqlNumber n) =
>     case n of
>       (n':_) -> not (isDigit n')
>       _ -> True
= lexical syntax

> tokensWillPrintAndLex _ SqlString {} Symbol {} = True
> tokensWillPrintAndLex _ SqlString {} Identifier {} = True
> tokensWillPrintAndLex _ SqlString {} HostParam {} = True
> tokensWillPrintAndLex _ SqlString {} PositionalArg {} = True
One possible gotcha: there isn't a one-to-one correspondence between e.g.
identifiers and string literals in the lexical syntax, and identifiers and
string literals in the main syntax.

> tokensWillPrintAndLex _ (SqlString _q00 q01 _s0) (SqlString q10 _q11 _s1) =
>     not (q01 == "'" && q10 == "'")
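for example (illustrative; the SqlString fields are taken to be start quote,
end quote and contents):

    SqlString "'" "'" "a" then SqlString "'" "'" "b"
        ->  'a''b'    one string containing a quote escape, so False
    SqlString "$$" "$$" "a" then SqlString "'" "'" "b"
        ->  $$a$$'b'  still two separate strings, so True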
== ansi

> tokensWillPrintAndLex _ SqlString {} SqlNumber {} = True

=== symbol
=== identifier
+ escaping
=== quoted identifier
+ escaping, prefixes
=== host param

> tokensWillPrintAndLex _ SqlNumber {} (Symbol ('.':_)) = False
> tokensWillPrintAndLex _ SqlNumber {} Symbol {} = True
> tokensWillPrintAndLex _ SqlNumber {} Identifier {} = True
> tokensWillPrintAndLex _ SqlNumber {} HostParam {} = True
> tokensWillPrintAndLex _ SqlNumber {} PositionalArg {} = True
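for example (illustrative):

    SqlNumber "1" then Symbol "."  ->  "1."  lexes back as the single number "1.", so False
    SqlNumber "1" then Symbol "+"  ->  "1+"  lexes back as "1" then "+", so True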
=== string literal-like
+ escaping, prefixes

todo: check for failures when the e following a number is fixed

=== number literals

> tokensWillPrintAndLex _ SqlNumber {} (SqlString ('e':_) _ _) = False
> tokensWillPrintAndLex _ SqlNumber {} (SqlString ('E':_) _ _) = False
> tokensWillPrintAndLex _ SqlNumber {} SqlString {} = True

=== whitespace

> tokensWillPrintAndLex _ (SqlNumber _) (SqlNumber _) = False
=== comments

todo: special case the lexer so that a second '.', and '.' and 'e' after an
exponent, are not allowed when there is no whitespace, even if there is an
unambiguous parse

== postgresql
TODO:
=== symbol

lex @variable in sql server
lex [quoted identifier] in sql server
lex #variable in oracle

=== identifier

make a new ctor for @var, #var
== postgresql

add token tables and tests for oracle, sql server

=== symbol

extended set of symbols + extensibility + special cases
: is a symbol and also part of host param
add odbc as a dialect flag and include {} as symbols when enabled

=== identifier

add negative / different parse dialect tests

same as ansi? is the character set the same?
refactor the tokenswillprintlex to be based on pretty printing the
individual tokens
=== quoted identifier

add special cases to lexer:
||| three or more pipes is a syntax error (except for postgres)
(and not e.g. ||,| or |,||)
an extra '.' or 'e' after a number without whitespace is a syntax error

same as ansi?
reject a */ pair of chars instead of parsing it as a symbol or several
symbols. Can this appear inside an operator in postgres? Let's make it not
possible anywhere since this could be really confusing.
=== host param

same as ansi (check char set)

=== string literal-like

dollar quoting
E quoting
missing n'?

=== number literals

same as ansi, i think

=== whitespace

same as ansi

=== comments

same as ansi

=== additions

$1 positional parameter

---- find what else is in hssqlppp to support mysql, oracle, sql server

do some user documentation on lexing, and lexing/dialects

start thinking about a more separated design for the dialect handling
@@ -5,15 +5,18 @@ Test for the lexer

> module Language.SQL.SimpleSQL.LexerTests (lexerTests) where

> import Language.SQL.SimpleSQL.TestTypes
> import Language.SQL.SimpleSQL.Lex (Token(..))
> import Language.SQL.SimpleSQL.Lex (Token(..),tokenListWillPrintAndLex)
> --import Debug.Trace
> import Data.Char (isAlpha)
> --import Data.Char (isAlpha)
> import Data.List

> lexerTests :: TestItem
> lexerTests = Group "lexerTests" $
>     [Group "lexer token tests" [ansiLexerTests
>                                ,postgresLexerTests]]
>                                ,postgresLexerTests
>                                ,sqlServerLexerTests
>                                ,oracleLexerTests
>                                ,odbcLexerTests]]

> ansiLexerTable :: [(String,[Token])]
> ansiLexerTable =
@@ -70,22 +73,7 @@ Test for the lexer

>     [ LexerTest ansi2011 (s ++ s1) (t ++ t1)
>     | (s,t) <- ansiLexerTable
>     , (s1,t1) <- ansiLexerTable

which combinations won't work:
<> <= >= || two single symbols which make a double char symbol
identifier + identifier if both are quoted or unquoted
string string
csstring string
line comment anything (can add newline?)
number number (todo: double check more carefully)

new idea:

create a function which will say if a series of lexical tokens should
survive the pretty -> lex cycle unchanged. Maybe this will lead to
clearer/more maintainable code.

>     , isGood $ t ++ t1
>     , tokenListWillPrintAndLex ansi2011 $ t ++ t1

>     ]
>     ,Group "adhoc lexer tests" $
@@ -95,73 +83,6 @@ clearer/more maintainable code.

>     ]
>     ]

>     where
>       isGood :: [Token] -> Bool
>       isGood l = {-let b =-} and $ map not [p l | p <- map listPred badCombos]
>                  -- in trace ("isGood " ++ show (l,b)) b
>       badCombos :: [((Token -> Bool),(Token -> Bool))]
>       badCombos = [symbolPair "<" ">"
>                   ,symbolPair "<" "="
>                   ,symbolPair ">" "="
>                   ,symbolPair "!" "="
>                   ,symbolPair "|" "|"
>                   ,symbolPair "||" "|"
>                   ,symbolPair "|" "||"
>                   ,symbolPair "||" "||"
>                   ,symbolPair "<" ">="
>                   ,symbolPair "-" "-"
>                   ,symbolPair "/" "*"
>                   ,symbolPair "*" "/"
>                   ,(isIdentifier, isIdentifier)
>                   ,(isDQIdentifier, isDQIdentifier)
>                   ,(isCQIdentifier, isDQIdentifier)
>                   ,(isString, isNonCsString)
>                   ,(isEofLineComment, const True)
>                   ,(isNumber, isNumber)
>                   ,(isHostParam,isIdentifier)
>                   ,(isHostParam,isCsString)
>                   ,(isHostParam,isCQIdentifier)
>                   ,(isIdentifier,isCsString)
>                   ,(isIdentifier,isCQIdentifier)
>                   ,(isWhitespace, isWhitespace)
>                   ,(isIdentifier, isNumber)
>                   ,(isHostParam, isNumber)
>                   ,(isMinus, isLineComment)
>                   ]
>       isIdentifier (Identifier Nothing _) = True
>       isIdentifier _ = False
>       isDQIdentifier (Identifier (Just ("\"",_)) _) = True
>       isDQIdentifier _ = False
>       isCQIdentifier (Identifier (Just ((x:_),_)) _) | isAlpha x = True
>       isCQIdentifier _ = False
>       isCsString (SqlString (x:_) _ _) | isAlpha x = True
>       isCsString _ = False
>       isString (SqlString _ _ _) = True
>       isString _ = False
>       isNonCsString (SqlString [] _ _) = True
>       isNonCsString (SqlString (x:_) _ _) | not (isAlpha x) = True
>       isNonCsString _ = False
>       isEofLineComment (LineComment s) = last s /= '\n'
>       isEofLineComment _ = False
>       isLineComment (LineComment {}) = True
>       isLineComment _ = False
>       isNumber (SqlNumber{}) = True
>       isNumber _ = False
>       isHostParam (HostParam{}) = True
>       isHostParam _ = False
>       isWhitespace (Whitespace{}) = True
>       isWhitespace _ = False
>       isMinus (Symbol "-") = True
>       isMinus _ = False
>       symbolPair a b = ((==Symbol a), (==Symbol b))
>       listPred :: ((Token -> Bool),(Token -> Bool)) -> [Token] -> Bool
>       listPred _ [] = False
>       listPred _ [_] = False
>       listPred (p,p1) (t:t1:ts) | p t && p1 t1 = True
>                                 | otherwise = listPred (p,p1) (t1:ts)

todo: lexing tests
do quickcheck testing:
can try to generate valid tokens then check they parse
@@ -202,7 +123,9 @@ operators without one of the exception chars

also: do the testing for the ansi compatibility special cases

>     ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]
>     ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 2]

>     ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
>             -- simple identifiers
>         in map (\i -> (i, [Identifier Nothing i])) idens
@@ -254,6 +177,11 @@ also: do the testing for the ansi compatibility special cases

>     ,"/* this *is/ a comment */"
>     ]

> postgresExtraOperatorTable :: [(String,[Token])]
> postgresExtraOperatorTable =
>     [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]

> someValidPostgresOperators :: Int -> [String]
> someValidPostgresOperators l =
>     [ x
@@ -281,8 +209,15 @@ the + or -.

> postgresLexerTests :: TestItem
> postgresLexerTests = Group "postgresLexerTests" $
>     [Group "postgres lexer token tests" $
>      [LexerTest postgres s t | (s,t) <- postgresLexerTable]
>     ,Group "adhoc lexer tests" $
>      [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable]
>     ,Group "postgres generated combination lexer tests" $
>     [ LexerTest postgres (s ++ s1) (t ++ t1)
>     | (s,t) <- postgresLexerTable
>     , (s1,t1) <- postgresLexerTable
>     , tokenListWillPrintAndLex postgres $ t ++ t1

>     ]
>     ,Group "adhoc postgres lexer tests" $
>      [LexerTest postgres s t
>      | (s,t) <- edgeCaseCommentOps
>                 ++ edgeCasePlusMinusOps
@@ -308,6 +243,36 @@ the + or -.

>     ]

> sqlServerLexerTests :: TestItem
> sqlServerLexerTests = Group "sqlServerLexerTests" $
>     [ LexerTest sqlserver s t | (s,t) <-
>       [--("@variable", [(Identifier (Just ("@", "")) "variable")])
>       --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")])
>       ]]

> oracleLexerTests :: TestItem
> oracleLexerTests = Group "oracleLexerTests" $
>     [ LexerTest oracle s t | (s,t) <-
>       [--("#variable", [(Identifier (Just ("#", "")) "variable")])
>       ]
>     ]

> odbcLexerTests :: TestItem
> odbcLexerTests = Group "odbcLexerTests" $
>     [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <-
>       [--("{}", [Symbol "{", Symbol "}"])
>       ]
>     ]

> combos :: [a] -> Int -> [[a]]
> combos _ 0 = [[]]
> combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]
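combos enumerates all the length n sequences over the given alphabet, e.g.

    combos "+-" 2  ==  ["++","+-","-+","--"]

(presumably for generating candidate multi character operator strings to test)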
figure out a way to do quickcheck testing:
1. generate valid tokens and check they parse

2. combine two generated tokens together for the combo testing

this especially will work much better for the postgresql extensible operator
tests, which take ages when done exhaustively and don't bring much benefit
over testing a few using quickcheck.