diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs index 7fb0bf0..302404b 100644 --- a/Language/SQL/SimpleSQL/Lex.lhs +++ b/Language/SQL/SimpleSQL/Lex.lhs @@ -44,7 +44,10 @@ todo: public documentation on dialect definition - and dialect flags > ,prettyToken > ,prettyTokens > ,ParseError(..) -> ,Dialect(..)) where +> ,Dialect(..) +> ,tokensWillPrintAndLex +> ,tokenListWillPrintAndLex +> ) where > import Language.SQL.SimpleSQL.Dialect @@ -68,7 +71,7 @@ todo: public documentation on dialect definition - and dialect flags > -- | Represents a lexed token > data Token -> -- | A symbol is one of the following +> -- | A symbol (in ansi dialect) is one of the following > -- > -- * multi char symbols <> <= >= != || > -- * single char symbols: * + - < > ^ / % ~ & | ? ( ) [ ] , ; ( ) @@ -404,6 +407,18 @@ which allows the last character of a multi character symbol to be + or > <*> option [] opMoreChars > ] +> symbol d | diSyntaxFlavour d == SQLServer = +> Symbol <$> choice (otherSymbol ++ regularOp) +> where +> otherSymbol = many1 (char '.') : +> map (string . (:[])) ",;():?" + +try is used because most of the first characters of the two character +symbols can also be part of a single character symbol + +> regularOp = map (try . string) [">=","<=","!=","<>","||"] +> ++ map (string . (:[])) "+-^*/%~&|<>=" + > symbol _ = > Symbol <$> choice (otherSymbol ++ regularOp) @@ -477,190 +492,246 @@ Some helper combinators > takeWhile p = many (satisfy p) > takeTill :: (Char -> Bool) -> Parser String -> takeTill p = -> manyTill anyChar (peekSatisfy p) +> takeTill p = manyTill anyChar (peekSatisfy p) > peekSatisfy :: (Char -> Bool) -> Parser () -> peekSatisfy p = do -> void $ lookAhead (satisfy p) +> peekSatisfy p = void $ lookAhead (satisfy p) + +This utility function will accurately report if the two tokens are +pretty printed, if they should lex back to the same two tokens. This +function is used in testing (and can be used in other places), and +must not be implemented by actually trying to print and then lex +(because then we would have the risk of thinking two tokens cannot be +together when there is bug in the lexer and it should be possible to +put them together. + +question: maybe pretty printing the tokens separately and then +analysing the concrete syntax without concatting the two printed +tokens together is a better way of doing this? + +maybe do some quick checking to make sure this function only gives +true negatives: check pairs which return false actually fail to lex or +give different symbols in return + +> tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool +> tokenListWillPrintAndLex _ [] = True +> tokenListWillPrintAndLex _ [_] = True +> tokenListWillPrintAndLex d (a:b:xs) = +> tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs) + +> tokensWillPrintAndLex :: Dialect -> Token -> Token -> Bool + +> tokensWillPrintAndLex d (Symbol ":") x = +> case prettyToken d x of +> -- eliminate cases: +> -- first letter of pretty x can be start of identifier +> -- this will look like a hostparam +> -- first letter of x is :, this will look like :: +> -- first letter of x is =, this will look like := +> (a:_) | a `elem` ":_=" || isAlpha a -> False +> _ -> True + +two symbols next to eachother will fail if the symbols can combine and +(possibly just the prefix) look like a different symbol, or if they +combine to look like comment markers -postgresql notes: -u& -SELECT 'foo' -'bar'; -is equivalent to: -SELECT 'foobar'; -SELECT 'foo' 'bar'; -is invalid +> tokensWillPrintAndLex d a@(Symbol {}) b@(Symbol {}) +> | a'@(_:_) <- prettyToken d a +> , ('-':_) <- prettyToken d b +> , last a' == '-' = False -(this should be in ansi also) +> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) (Symbol a) (Symbol x) = +> (x `elem` ["+", "-"]) +> && and (map (`notElem` a) "~!@#%^&|`?") -definitely do major review and docs: -when can escapes and prefixes be using with syntactic string literals -when can they be combined -when can e.g. dollar quoting be used -what escaping should there be, including unicode escapes +> tokensWillPrintAndLex _ (Symbol s1) (Symbol s2) = +> (s1,s2) `notElem` +> [("<",">") +> ,("<","=") +> ,(">","=") +> ,("!","=") +> ,("|","|") +> -- todo: make the lexer special case reject ||| and so on +> -- because it is ambiguous +> --,("||","|") +> ,("|","||") +> --,("||","||") +> ,("<",">=") +> ,("-","-") +> ,("/","*") +> -- todo: special case make the lexer reject */ next to eachother +> -- because trying to interpret it as *,/ will lead to much worse error messages +> --,("*","/") +> ] + +two whitespaces will be combined + +> tokensWillPrintAndLex _ Whitespace {} Whitespace {} = False + +line comment without a newline at the end will eat the next token + +> tokensWillPrintAndLex _ (LineComment s@(_:_)) _ = last s == '\n' + +this should never happen, but the case satisfies the haskell compiler +and isn't exactly wrong + +> tokensWillPrintAndLex _ (LineComment []) _ = False + +apart from two above cases, leading and trailing whitespace will always be ok + +> tokensWillPrintAndLex _ Whitespace {} _ = True +> tokensWillPrintAndLex _ _ Whitespace {} = True + +a symbol ending with a '-' followed by a line comment will lex back +differently, since the --- will combine and move the comment eating +some of the symbol + +> tokensWillPrintAndLex _ (Symbol s) (LineComment {}) = +> case s of +> (_:_) -> last s /= '-' +> _ -> True + +in other situations a trailing line comment or a leading or trailing +block comment will work + +> tokensWillPrintAndLex _ _ LineComment {} = True +> tokensWillPrintAndLex _ _ BlockComment {} = True +> tokensWillPrintAndLex _ BlockComment {} _ = True -E'string' -with a range of escapes which should appear in the dialect data type +> tokensWillPrintAndLex _ Symbol {} Identifier {} = True -dollar quoted strings -never with prefixes/escapes - -B'' -X'' - -numbers - -:: cast +> tokensWillPrintAndLex _ Symbol {} HostParam {} = True +> tokensWillPrintAndLex _ Symbol {} PositionalArg {} = True +> tokensWillPrintAndLex _ Symbol {} SqlString {} = True +> tokensWillPrintAndLex (Dialect {diSyntaxFlavour = Postgres}) Symbol {} (SqlNumber ('.':_)) = False +> tokensWillPrintAndLex _ Symbol {} SqlNumber {} = True -type 'string' - literals only, not array types - ansi allows for some specific types -'string'::type -cast('string' as type) +identifier: + symbol ok + identifier: + alphas then alphas: bad + quote then quote (with same start and end quote): bad + quote [ ] then quote [ ]: ok? this technically works, not sure if + it is a good ui, or requiring whitepace/comment is better. See + what sql server does + second is quote with prefix: makes it ok + host param: ok, but maybe should require whitespace for ui reasons + positional arg: ok, but maybe should require whitespace for ui reasons + string: ok, but maybe should require whitespace for ui reasons + number: ok, but maybe should require whitespace for ui reasons -can use dollar quoting here -typename('string') (not all types) -check these in the parser for keyword issues +> tokensWillPrintAndLex _ Identifier {} Symbol {} = True +> tokensWillPrintAndLex _ (Identifier Nothing _) (Identifier Nothing _) = False +> tokensWillPrintAndLex _ (Identifier Nothing _) (Identifier (Just (a,_)) _) = +> case a of +> (a':_) | isAlpha a' -> False +> _ -> True +> tokensWillPrintAndLex _ (Identifier Just {} _) (Identifier Nothing _) = True +> tokensWillPrintAndLex _ (Identifier (Just(_,b)) _) (Identifier (Just(c,_)) _) = +> not (b == c) +> tokensWillPrintAndLex _ Identifier {} HostParam {} = True +> tokensWillPrintAndLex _ Identifier {} PositionalArg {} = True +> tokensWillPrintAndLex _ (Identifier Nothing _) (SqlString a _ _) = +> case a of +> (a':_) | isAlpha a' -> False +> _ -> True -extended operator rules - -$1 positional parameter -() -[] -, -; -: array slices and variable names/hostparam -* -. - -some operator precedence notes - -SELECT 3 OPERATOR(pg_catalog.+) 4; - -diff from ansi: - -all the same symbols + more + different rules about parsing multi char -symbols (ansi is trivial here, postgresql is not trivial and -extensible) - -identifiers: same, doublecheck the u& -hostparam: same, but with implementation issues because : is also a - symbol in postgresql. this might be a little tricky to deal with - -string literals: -u&?, does pg support n? - -numbers: same -whitespace, comments: same - -make sure there is a list of lexical syntax which is valid in postgres -and not in ansi, and vice versa, and have explicit tests for -these. There might also be situations here where a string is valid in -both, but lexes differently. There is definitely cases like this in -the main syntax. +> tokensWillPrintAndLex _ Identifier {} SqlString {} = True +> tokensWillPrintAndLex _ (Identifier Nothing _) (SqlNumber s) = +> case s of +> (s':_) -> not (isDigit s') +> _ -> True +> tokensWillPrintAndLex _ Identifier {} SqlNumber {} = True -action plan: -no abstract syntax changes are needed -write down a spec for ansi and for postgresql lexical syntax -create a list of tests for postgresql - include eveything from ansi which is the same: maybe refactor the - tests to make this maintainable -design for escaping issues - (affects ansi also) -design for string literal-like syntax and for continuation strings - (affects ansi also) +> tokensWillPrintAndLex _ HostParam {} Symbol {} = True +> tokensWillPrintAndLex _ HostParam {} (Identifier Nothing _) = False +> tokensWillPrintAndLex _ HostParam {} (Identifier (Just (a,_)) _) = +> case a of +> c:_ -> not (isAlpha c) +> [] -> False -the test approach in general is first to parse basic examples of each -kind of token, then to manually come up with some edge cases to test, -and then to generate a good representative set of tokens (probably the -same set as the previous two categories), and create the cross product -of pairs of these tokens, eliminate ones when the tokens are next to -each other and it doesn't parse as the two separate tokens, using -manually written rules (want to be super accurate here - no false -positives or negatives), then test these all parse good as -well. Separating out the lexing in this way and doing this approach I -think gives a very good chance of minimising bugs in the basic -parsing, especially in the hairy bits. +> tokensWillPrintAndLex _ HostParam {} HostParam {} = True +> tokensWillPrintAndLex _ HostParam {} PositionalArg {} = True +> tokensWillPrintAndLex _ HostParam {} (SqlString a _ _) = +> case a of +> (a':_) | isAlpha a' -> False +> _ -> True +> tokensWillPrintAndLex _ HostParam {} (SqlNumber s) = +> case s of +> (s':_) -> not (isDigit s') +> _ -> True +> tokensWillPrintAndLex _ PositionalArg {} Symbol {} = True +> tokensWillPrintAndLex _ PositionalArg {} Identifier {} = True +> tokensWillPrintAndLex _ PositionalArg {} HostParam {} = True +> tokensWillPrintAndLex _ PositionalArg {} PositionalArg {} = True +> tokensWillPrintAndLex _ PositionalArg {} SqlString {} = True -- todo: think carefully about dollar quoting? +> tokensWillPrintAndLex _ PositionalArg {} (SqlNumber n) = +> case n of +> (n':_) -> not (isDigit n') +> _ -> True -= lexical syntax +> tokensWillPrintAndLex _ SqlString {} Symbol {} = True +> tokensWillPrintAndLex _ SqlString {} Identifier {} = True +> tokensWillPrintAndLex _ SqlString {} HostParam {} = True +> tokensWillPrintAndLex _ SqlString {} PositionalArg {} = True -One possible gotcha: there isn't a one-one correpsondence between e.g -identifiers and string literals in the lexical syntax, and identifiers -and string literals in the main syntax. +> tokensWillPrintAndLex _ (SqlString _q00 q01 _s0) (SqlString q10 _q11 _s1) = +> not (q01 == "'" && q10 == "'") -== ansi +> tokensWillPrintAndLex _ SqlString {} SqlNumber {} = True -=== symbol -=== identifier -+ escaping -=== quoted identifier -+ escaping, prefixes -=== host param +> tokensWillPrintAndLex _ SqlNumber {} (Symbol ('.':_)) = False +> tokensWillPrintAndLex _ SqlNumber {} Symbol {} = True +> tokensWillPrintAndLex _ SqlNumber {} Identifier {} = True +> tokensWillPrintAndLex _ SqlNumber {} HostParam {} = True +> tokensWillPrintAndLex _ SqlNumber {} PositionalArg {} = True -=== string literal-like -+ escaping, prefixes +todo: check for failures when e following number is fixed -=== number literals +> tokensWillPrintAndLex _ SqlNumber {} (SqlString ('e':_) _ _) = False +> tokensWillPrintAndLex _ SqlNumber {} (SqlString ('E':_) _ _) = False +> tokensWillPrintAndLex _ SqlNumber {} SqlString {} = True -=== whitespace +> tokensWillPrintAndLex _ (SqlNumber _) (SqlNumber _) = False -=== comments +todo: special case lexer so a second ., and . and e are not +allowed after exponent when there is no whitespace, even if there +is an unambiguous parse -== postgresql +TODO: -=== symbol +lex @variable in sql server +lex [quoted identifier] in sql server +lex #variable in oracle -=== identifier +make a new ctor for @var, #var -== postgresql +add token tables and tests for oracle, sql server -=== symbol -extended set of symbols + extensibility + special cases -: is a symbol and also part of host param +add odbc as a dialect flag and include {} as symbols when enabled -=== identifier +add negative / different parse dialect tests -same as ansi? is the character set the same? +refactor the tokenswillprintlex to be based on pretty printing the +individual tokens -=== quoted identifier +add special cases to lexer: +||| three or more pipes is a syntax error (except for postgres) + (and not ||,| or |,|| e.g.) +extra ., e after number without whitespace is a syntax error -same as ansi? +reject a */ pair of chars instead of parsing it as a symbol or several +symbols. Can this appear inside an operator in postgres? Let's make it +not possible anywhere since this could be really confusing. -=== host param - -same as ansi (check char set) - -=== string literal-like - -dollar quoting -E quoting -missing n'? - - -=== number literals - -same as ansi, i think - -=== whitespace - -same as ansi - -=== comments - -same as ansi - -=== additions -$1 positional parameter - ----- find what else is in hssqlppp to support mysql, oracle, sql - server +do some user documentation on lexing, and lexing/dialects +start thinking about a more separated design for the dialect handling diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs index 8f6f338..3737091 100644 --- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs +++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs @@ -5,15 +5,18 @@ Test for the lexer > module Language.SQL.SimpleSQL.LexerTests (lexerTests) where > import Language.SQL.SimpleSQL.TestTypes -> import Language.SQL.SimpleSQL.Lex (Token(..)) +> import Language.SQL.SimpleSQL.Lex (Token(..),tokenListWillPrintAndLex) > --import Debug.Trace -> import Data.Char (isAlpha) +> --import Data.Char (isAlpha) > import Data.List > lexerTests :: TestItem > lexerTests = Group "lexerTests" $ > [Group "lexer token tests" [ansiLexerTests -> ,postgresLexerTests]] +> ,postgresLexerTests +> ,sqlServerLexerTests +> ,oracleLexerTests +> ,odbcLexerTests]] > ansiLexerTable :: [(String,[Token])] > ansiLexerTable = @@ -70,22 +73,7 @@ Test for the lexer > [ LexerTest ansi2011 (s ++ s1) (t ++ t1) > | (s,t) <- ansiLexerTable > , (s1,t1) <- ansiLexerTable - -which combinations won't work: -<> <= >= || two single symbols which make a double char symbol -identifier + identifier if both are quoted or unquoted -string string -csstring string -line comment anything (can add newline?) -number number (todo: double check more carefully) - -new idea: - -create a function which will say if a series of lexical tokens should -survive the pretty -> lex cycle unchanged. Maybe this will lead to -clearer/more maintainable code. - -> , isGood $ t ++ t1 +> , tokenListWillPrintAndLex ansi2011 $ t ++ t1 > ] > ,Group "adhoc lexer tests" $ @@ -95,73 +83,6 @@ clearer/more maintainable code. > ] > ] -> where -> isGood :: [Token] -> Bool -> isGood l = {-let b =-} and $ map not [p l | p <- map listPred badCombos] -> -- in trace ("isGood " ++ show (l,b)) b -> badCombos :: [((Token -> Bool),(Token -> Bool))] -> badCombos = [symbolPair "<" ">" -> ,symbolPair "<" "=" -> ,symbolPair ">" "=" -> ,symbolPair "!" "=" -> ,symbolPair "|" "|" -> ,symbolPair "||" "|" -> ,symbolPair "|" "||" -> ,symbolPair "||" "||" -> ,symbolPair "<" ">=" - -> ,symbolPair "-" "-" -> ,symbolPair "/" "*" -> ,symbolPair "*" "/" - -> ,(isIdentifier, isIdentifier) -> ,(isDQIdentifier, isDQIdentifier) -> ,(isCQIdentifier, isDQIdentifier) -> ,(isString, isNonCsString) -> ,(isEofLineComment, const True) -> ,(isNumber, isNumber) -> ,(isHostParam,isIdentifier) -> ,(isHostParam,isCsString) -> ,(isHostParam,isCQIdentifier) -> ,(isIdentifier,isCsString) -> ,(isIdentifier,isCQIdentifier) -> ,(isWhitespace, isWhitespace) -> ,(isIdentifier, isNumber) -> ,(isHostParam, isNumber) -> ,(isMinus, isLineComment) -> ] -> isIdentifier (Identifier Nothing _) = True -> isIdentifier _ = False -> isDQIdentifier (Identifier (Just ("\"",_)) _) = True -> isDQIdentifier _ = False -> isCQIdentifier (Identifier (Just ((x:_),_)) _) | isAlpha x = True -> isCQIdentifier _ = False -> isCsString (SqlString (x:_) _ _) | isAlpha x = True -> isCsString _ = False -> isString (SqlString _ _ _) = True -> isString _ = False -> isNonCsString (SqlString [] _ _) = True -> isNonCsString (SqlString (x:_) _ _) | not (isAlpha x) = True -> isNonCsString _ = False -> isEofLineComment (LineComment s) = last s /= '\n' -> isEofLineComment _ = False -> isLineComment (LineComment {}) = True -> isLineComment _ = False -> isNumber (SqlNumber{}) = True -> isNumber _ = False -> isHostParam (HostParam{}) = True -> isHostParam _ = False -> isWhitespace (Whitespace{}) = True -> isWhitespace _ = False -> isMinus (Symbol "-") = True -> isMinus _ = False -> symbolPair a b = ((==Symbol a), (==Symbol b)) -> listPred :: ((Token -> Bool),(Token -> Bool)) -> [Token] -> Bool -> listPred _ [] = False -> listPred _ [_] = False -> listPred (p,p1) (t:t1:ts) | p t && p1 t1 = True -> | otherwise = listPred (p,p1) (t1:ts) - todo: lexing tests do quickcheck testing: can try to generate valid tokens then check they parse @@ -202,7 +123,9 @@ operators without one of the exception chars also: do the testing for the ansi compatibility special cases -> ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 4] +> ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 2] + + > ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"] > -- simple identifiers > in map (\i -> (i, [Identifier Nothing i])) idens @@ -254,6 +177,11 @@ also: do the testing for the ansi compatibility special cases > ,"/* this *is/ a comment */" > ] +> postgresExtraOperatorTable :: [(String,[Token])] +> postgresExtraOperatorTable = +> [ (x, [Symbol x]) | x <- someValidPostgresOperators 4] + + > someValidPostgresOperators :: Int -> [String] > someValidPostgresOperators l = > [ x @@ -281,8 +209,15 @@ the + or -. > postgresLexerTests :: TestItem > postgresLexerTests = Group "postgresLexerTests" $ > [Group "postgres lexer token tests" $ -> [LexerTest postgres s t | (s,t) <- postgresLexerTable] -> ,Group "adhoc lexer tests" $ +> [LexerTest postgres s t | (s,t) <- postgresLexerTable ++ postgresExtraOperatorTable] +> ,Group "postgres generated combination lexer tests" $ +> [ LexerTest postgres (s ++ s1) (t ++ t1) +> | (s,t) <- postgresLexerTable +> , (s1,t1) <- postgresLexerTable +> , tokenListWillPrintAndLex postgres $ t ++ t1 + +> ] +> ,Group "adhoc postgres lexer tests" $ > [LexerTest postgres s t > | (s,t) <- edgeCaseCommentOps > ++ edgeCasePlusMinusOps @@ -308,6 +243,36 @@ the + or -. > ] +> sqlServerLexerTests :: TestItem +> sqlServerLexerTests = Group "sqlServerLexerTests" $ +> [ LexerTest sqlserver s t | (s,t) <- +> [--("@variable", [(Identifier (Just ("@", "")) "variable")]) +> --,("[quoted identifier]", [(Identifier (Just ("[", "]")) "variable")]) +> ]] + +> oracleLexerTests :: TestItem +> oracleLexerTests = Group "oracleLexerTests" $ +> [ LexerTest oracle s t | (s,t) <- +> [--("#variable", [(Identifier (Just ("#", "")) "variable")]) +> ] +> ] + +> odbcLexerTests :: TestItem +> odbcLexerTests = Group "odbcLexerTests" $ +> [ LexerTest sqlserver {- {odbc = True} -} s t | (s,t) <- +> [--("{}", [Symbol "{", Symbol "}"]) +> ] +> ] + > combos :: [a] -> Int -> [[a]] > combos _ 0 = [[]] > combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ] + +figure out a way to do quickcheck testing: +1. generate valid tokens and check they parse + +2. combine two generated tokens together for the combo testing + +this especially will work much better for the postgresql extensible +operator tests which doing exhaustively takes ages and doesn't bring +much benefit over testing a few using quickcheck.