From c24008444c8c2895c162c98a145afd8fa23b3fd6 Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Sat, 13 Feb 2016 20:28:12 +0200 Subject: [PATCH] work on postgres operator parsing --- Language/SQL/SimpleSQL/Lex.lhs | 111 +++++++++++++++++--- tools/Language/SQL/SimpleSQL/LexerTests.lhs | 78 +++++++++++++- 2 files changed, 171 insertions(+), 18 deletions(-) diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs index 238a889..3a2825a 100644 --- a/Language/SQL/SimpleSQL/Lex.lhs +++ b/Language/SQL/SimpleSQL/Lex.lhs @@ -54,7 +54,8 @@ todo: public documentation on dialect definition - and dialect flags > ,setPosition,getPosition > ,setSourceColumn,setSourceLine > ,sourceName, setSourceName -> ,sourceLine, sourceColumn) +> ,sourceLine, sourceColumn +> ,notFollowedBy) > import Language.SQL.SimpleSQL.Combinators > import Language.SQL.SimpleSQL.Errors > import Control.Applicative hiding ((<|>), many) @@ -329,21 +330,101 @@ character symbols in the two lists below. > symbol :: Dialect -> Parser Token > symbol d | diSyntaxFlavour d == Postgres = -> Symbol <$> -> choice ( -> many1 (char '.') : -> map (try . string) [">=","<=","!=","<>","||", "::", ":="] -> ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;():") +> Symbol <$> choice (otherSymbol ++ [singlePlusMinus,opMoreChars]) + +rules + +An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list: + ++ - * / < > = ~ ! @ # % ^ & | ` ? + +There are a few restrictions on operator names, however: +-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment. + +A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters: + +~ ! @ # % ^ & | ` ? + +> where +> -- other symbols are all the tokens which parse as symbols in +> -- this lexer which aren't considered operators in postgresql +> -- a single ? is parsed as a operator here instead of an other +> -- symbol because this is the least complex way to do it +> otherSymbol = many1 (char '.') : +> (map (try . string) ["::", ":="] +> ++ map (string . (:[])) "[],;():") + +exception char is one of: +~ ! @ # % ^ & | ` ? +which allows the last character of a multi character symbol to be + or +- + +> allOpSymbols = "+-*/<>=~!@#%^&|`?" +> -- all symbols except - and / which can be used to start +> -- a comment token +> allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols +> -- these are the symbols when if part of a multi character +> -- operator permit the operator to end with a + or - symbol +> exceptionOpSymbols = "~!@#%^&|`?" +> +> -- special case for parsing a single + or - symbol +> singlePlusMinus = try $ do +> c <- choice $ map char "+-" +> -- todo: deal with e.g. --- +-- +/* ? +> notFollowedBy $ choice $ map char allOpSymbols +> return [c] + +> -- this is used when we are parsing a potentially multi symbol +> -- operator and we have alread seen one of the 'exception chars' +> -- and so we can end with a + or - +> moreOpCharsException = do +> c <- choice (map char allOpSymbolsNoCommentStarters +> -- make sure we don't parse a comment starting token +> -- as part of an operator +> ++ [try (char '/' <* notFollowedBy (char '*')) +> ,try (char '-' <* notFollowedBy (char '-'))]) +> (c:) <$> option [] moreOpCharsException + +> opMoreChars = choice +> [do +> -- parse an exception char, now we can finish with a + - +> c <- choice $ map char exceptionOpSymbols +> (c:) <$> option [] moreOpCharsException +> ,do +> -- parse + or -, make sure it isn't the last symbol +> c <- try (char '+' +> -- make sure there is another symbol +> <* lookAhead (choice $ map char allOpSymbols)) +> (c:) <$> option [] opMoreChars +> ,do +> c <- try (char '-' +> -- check for comment +> <* notFollowedBy (char '-') +> -- make sure there is another symbol +> <* lookAhead (choice $ map char allOpSymbols)) +> (c:) <$> option [] opMoreChars +> ,do +> -- parse one of the other ansi operator symbols +> c <- choice (-- check / isn't start of comment /* +> try (char '/' <* notFollowedBy (char '*')) +> : map char "*<>=") +> (c:) <$> option [] opMoreChars +> ] + + +> symbol _ = +> Symbol <$> choice (otherSymbol ++ regularOp) +> where +> otherSymbol = many1 (char '.') : +> map (string . (:[])) "[],;():?" + +try is used because most of the first characters of the two character +symbols can also be part of a single character symbol + +> regularOp = map (try . string) [">=","<=","!=","<>","||"] +> ++ map (string . (:[])) "+-^*/%~&|<>=" + -> symbol _ = Symbol <$> -> choice ( -> many1 (char '.') : -> -- try is used because most of the first -> -- characters of the two character symbols -> -- can also be part of a single character symbol -> -- maybe this would be better with left factoring? -> map (try . string) [">=","<=","!=","<>","||"] -> ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;()") > sqlWhitespace :: Dialect -> Parser Token > sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace) diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs index 7e7e76b..c8a21ae 100644 --- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs +++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs @@ -8,6 +8,7 @@ Test for the lexer > import Language.SQL.SimpleSQL.Lex (Token(..)) > --import Debug.Trace > import Data.Char (isAlpha) +> import Data.List > lexerTests :: TestItem > lexerTests = Group "lexerTests" $ @@ -78,6 +79,12 @@ csstring string line comment anything (can add newline?) number number (todo: double check more carefully) +new idea: + +create a function which will say if a series of lexical tokens should +survive the pretty -> lex cycle unchanged. Maybe this will lead to +clearer/more maintainable code. + > , isGood $ t ++ t1 > ] @@ -174,8 +181,28 @@ assurance. > map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():" > -- multi char symbols > ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="] -> -- todo: add many examples of generic symbols -> -- also: do the testing for the ansi compatibility special cases +> -- generic symbols + +An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list: + ++ - * / < > = ~ ! @ # % ^ & | ` ? + +There are a few restrictions on operator names, however: +-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment. + +A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters: + +~ ! @ # % ^ & | ` ? + +todo: 'negative' tests +symbol then -- +symbol then /* +operators without one of the exception chars + followed by + or - without whitespace + +also: do the testing for the ansi compatibility special cases + +> ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 4] > ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"] > -- simple identifiers > in map (\i -> (i, [Identifier Nothing i])) idens @@ -227,7 +254,52 @@ assurance. > ,"/* this *is/ a comment */" > ] +> someValidPostgresOperators :: Int -> [String] +> someValidPostgresOperators l = +> [ x +> | n <- [1..l] +> , x <- combos "+-*/<>=~!@#%^&|`?" n +> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x) +> , not (last x `elem` "+-") +> || or (map (`elem` x) "~!@#%^&|`?") +> ] + +These are postgres operators, which if followed immediately by a + or +-, will lex as separate operators rather than one operator including +the + or -. + +> somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String] +> somePostgresOpsWhichWontAddTrailingPlusMinus l = +> [ x +> | n <- [1..l] +> , x <- combos "+-*/<>=" n +> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x) +> , not (last x `elem` "+-") +> ] + + > postgresLexerTests :: TestItem > postgresLexerTests = Group "postgresLexerTests" $ -> [Group "postgres lexer token tests" $ [LexerTest postgres s t | (s,t) <- postgresLexerTable] +> [Group "postgres lexer token tests" $ +> [LexerTest postgres s t | (s,t) <- postgresLexerTable] +> ,Group "adhoc lexer tests" $ +> [LexerTest postgres s t +> | (s,t) <- edgeCaseCommentOps ++ edgeCasePlusMinusOps ] > ] +> where +> edgeCaseCommentOps = concat +> [ [ (x ++ "/* , (x ++ "-- | x <- someValidPostgresOperators 2 +> , not (last x == '-') +> ] +> edgeCasePlusMinusOps = concat +> [ [ (x ++ "+", [Symbol x, Symbol "+"]) +> , (x ++ "-", [Symbol x, Symbol "-"]) ] +> | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2 +> ] + + +> combos :: [a] -> Int -> [[a]] +> combos _ 0 = [[]] +> combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]