work on postgres operator parsing
This commit is contained in:
parent
5084c0c3ab
commit
c24008444c
|
@ -54,7 +54,8 @@ todo: public documentation on dialect definition - and dialect flags
|
||||||
> ,setPosition,getPosition
|
> ,setPosition,getPosition
|
||||||
> ,setSourceColumn,setSourceLine
|
> ,setSourceColumn,setSourceLine
|
||||||
> ,sourceName, setSourceName
|
> ,sourceName, setSourceName
|
||||||
> ,sourceLine, sourceColumn)
|
> ,sourceLine, sourceColumn
|
||||||
|
> ,notFollowedBy)
|
||||||
> import Language.SQL.SimpleSQL.Combinators
|
> import Language.SQL.SimpleSQL.Combinators
|
||||||
> import Language.SQL.SimpleSQL.Errors
|
> import Language.SQL.SimpleSQL.Errors
|
||||||
> import Control.Applicative hiding ((<|>), many)
|
> import Control.Applicative hiding ((<|>), many)
|
||||||
|
@ -329,21 +330,101 @@ character symbols in the two lists below.
|
||||||
|
|
||||||
> symbol :: Dialect -> Parser Token
|
> symbol :: Dialect -> Parser Token
|
||||||
> symbol d | diSyntaxFlavour d == Postgres =
|
> symbol d | diSyntaxFlavour d == Postgres =
|
||||||
> Symbol <$>
|
> Symbol <$> choice (otherSymbol ++ [singlePlusMinus,opMoreChars])
|
||||||
> choice (
|
|
||||||
> many1 (char '.') :
|
rules
|
||||||
> map (try . string) [">=","<=","!=","<>","||", "::", ":="]
|
|
||||||
> ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;():")
|
An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list:
|
||||||
|
|
||||||
|
+ - * / < > = ~ ! @ # % ^ & | ` ?
|
||||||
|
|
||||||
|
There are a few restrictions on operator names, however:
|
||||||
|
-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
|
||||||
|
|
||||||
|
A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters:
|
||||||
|
|
||||||
|
~ ! @ # % ^ & | ` ?
|
||||||
|
|
||||||
|
> where
|
||||||
|
> -- other symbols are all the tokens which parse as symbols in
|
||||||
|
> -- this lexer which aren't considered operators in postgresql
|
||||||
|
> -- a single ? is parsed as a operator here instead of an other
|
||||||
|
> -- symbol because this is the least complex way to do it
|
||||||
|
> otherSymbol = many1 (char '.') :
|
||||||
|
> (map (try . string) ["::", ":="]
|
||||||
|
> ++ map (string . (:[])) "[],;():")
|
||||||
|
|
||||||
|
exception char is one of:
|
||||||
|
~ ! @ # % ^ & | ` ?
|
||||||
|
which allows the last character of a multi character symbol to be + or
|
||||||
|
-
|
||||||
|
|
||||||
|
> allOpSymbols = "+-*/<>=~!@#%^&|`?"
|
||||||
|
> -- all symbols except - and / which can be used to start
|
||||||
|
> -- a comment token
|
||||||
|
> allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols
|
||||||
|
> -- these are the symbols when if part of a multi character
|
||||||
|
> -- operator permit the operator to end with a + or - symbol
|
||||||
|
> exceptionOpSymbols = "~!@#%^&|`?"
|
||||||
|
>
|
||||||
|
> -- special case for parsing a single + or - symbol
|
||||||
|
> singlePlusMinus = try $ do
|
||||||
|
> c <- choice $ map char "+-"
|
||||||
|
> -- todo: deal with e.g. --- +-- +/* ?
|
||||||
|
> notFollowedBy $ choice $ map char allOpSymbols
|
||||||
|
> return [c]
|
||||||
|
|
||||||
|
> -- this is used when we are parsing a potentially multi symbol
|
||||||
|
> -- operator and we have alread seen one of the 'exception chars'
|
||||||
|
> -- and so we can end with a + or -
|
||||||
|
> moreOpCharsException = do
|
||||||
|
> c <- choice (map char allOpSymbolsNoCommentStarters
|
||||||
|
> -- make sure we don't parse a comment starting token
|
||||||
|
> -- as part of an operator
|
||||||
|
> ++ [try (char '/' <* notFollowedBy (char '*'))
|
||||||
|
> ,try (char '-' <* notFollowedBy (char '-'))])
|
||||||
|
> (c:) <$> option [] moreOpCharsException
|
||||||
|
|
||||||
|
> opMoreChars = choice
|
||||||
|
> [do
|
||||||
|
> -- parse an exception char, now we can finish with a + -
|
||||||
|
> c <- choice $ map char exceptionOpSymbols
|
||||||
|
> (c:) <$> option [] moreOpCharsException
|
||||||
|
> ,do
|
||||||
|
> -- parse + or -, make sure it isn't the last symbol
|
||||||
|
> c <- try (char '+'
|
||||||
|
> -- make sure there is another symbol
|
||||||
|
> <* lookAhead (choice $ map char allOpSymbols))
|
||||||
|
> (c:) <$> option [] opMoreChars
|
||||||
|
> ,do
|
||||||
|
> c <- try (char '-'
|
||||||
|
> -- check for comment
|
||||||
|
> <* notFollowedBy (char '-')
|
||||||
|
> -- make sure there is another symbol
|
||||||
|
> <* lookAhead (choice $ map char allOpSymbols))
|
||||||
|
> (c:) <$> option [] opMoreChars
|
||||||
|
> ,do
|
||||||
|
> -- parse one of the other ansi operator symbols
|
||||||
|
> c <- choice (-- check / isn't start of comment /*
|
||||||
|
> try (char '/' <* notFollowedBy (char '*'))
|
||||||
|
> : map char "*<>=")
|
||||||
|
> (c:) <$> option [] opMoreChars
|
||||||
|
> ]
|
||||||
|
|
||||||
|
|
||||||
|
> symbol _ =
|
||||||
|
> Symbol <$> choice (otherSymbol ++ regularOp)
|
||||||
|
> where
|
||||||
|
> otherSymbol = many1 (char '.') :
|
||||||
|
> map (string . (:[])) "[],;():?"
|
||||||
|
|
||||||
|
try is used because most of the first characters of the two character
|
||||||
|
symbols can also be part of a single character symbol
|
||||||
|
|
||||||
|
> regularOp = map (try . string) [">=","<=","!=","<>","||"]
|
||||||
|
> ++ map (string . (:[])) "+-^*/%~&|<>="
|
||||||
|
|
||||||
|
|
||||||
> symbol _ = Symbol <$>
|
|
||||||
> choice (
|
|
||||||
> many1 (char '.') :
|
|
||||||
> -- try is used because most of the first
|
|
||||||
> -- characters of the two character symbols
|
|
||||||
> -- can also be part of a single character symbol
|
|
||||||
> -- maybe this would be better with left factoring?
|
|
||||||
> map (try . string) [">=","<=","!=","<>","||"]
|
|
||||||
> ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;()")
|
|
||||||
|
|
||||||
> sqlWhitespace :: Dialect -> Parser Token
|
> sqlWhitespace :: Dialect -> Parser Token
|
||||||
> sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
|
> sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
|
||||||
|
|
|
@ -8,6 +8,7 @@ Test for the lexer
|
||||||
> import Language.SQL.SimpleSQL.Lex (Token(..))
|
> import Language.SQL.SimpleSQL.Lex (Token(..))
|
||||||
> --import Debug.Trace
|
> --import Debug.Trace
|
||||||
> import Data.Char (isAlpha)
|
> import Data.Char (isAlpha)
|
||||||
|
> import Data.List
|
||||||
|
|
||||||
> lexerTests :: TestItem
|
> lexerTests :: TestItem
|
||||||
> lexerTests = Group "lexerTests" $
|
> lexerTests = Group "lexerTests" $
|
||||||
|
@ -78,6 +79,12 @@ csstring string
|
||||||
line comment anything (can add newline?)
|
line comment anything (can add newline?)
|
||||||
number number (todo: double check more carefully)
|
number number (todo: double check more carefully)
|
||||||
|
|
||||||
|
new idea:
|
||||||
|
|
||||||
|
create a function which will say if a series of lexical tokens should
|
||||||
|
survive the pretty -> lex cycle unchanged. Maybe this will lead to
|
||||||
|
clearer/more maintainable code.
|
||||||
|
|
||||||
> , isGood $ t ++ t1
|
> , isGood $ t ++ t1
|
||||||
|
|
||||||
> ]
|
> ]
|
||||||
|
@ -174,8 +181,28 @@ assurance.
|
||||||
> map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():"
|
> map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():"
|
||||||
> -- multi char symbols
|
> -- multi char symbols
|
||||||
> ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="]
|
> ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="]
|
||||||
> -- todo: add many examples of generic symbols
|
> -- generic symbols
|
||||||
> -- also: do the testing for the ansi compatibility special cases
|
|
||||||
|
An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list:
|
||||||
|
|
||||||
|
+ - * / < > = ~ ! @ # % ^ & | ` ?
|
||||||
|
|
||||||
|
There are a few restrictions on operator names, however:
|
||||||
|
-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
|
||||||
|
|
||||||
|
A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters:
|
||||||
|
|
||||||
|
~ ! @ # % ^ & | ` ?
|
||||||
|
|
||||||
|
todo: 'negative' tests
|
||||||
|
symbol then --
|
||||||
|
symbol then /*
|
||||||
|
operators without one of the exception chars
|
||||||
|
followed by + or - without whitespace
|
||||||
|
|
||||||
|
also: do the testing for the ansi compatibility special cases
|
||||||
|
|
||||||
|
> ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]
|
||||||
> ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
|
> ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
|
||||||
> -- simple identifiers
|
> -- simple identifiers
|
||||||
> in map (\i -> (i, [Identifier Nothing i])) idens
|
> in map (\i -> (i, [Identifier Nothing i])) idens
|
||||||
|
@ -227,7 +254,52 @@ assurance.
|
||||||
> ,"/* this *is/ a comment */"
|
> ,"/* this *is/ a comment */"
|
||||||
> ]
|
> ]
|
||||||
|
|
||||||
|
> someValidPostgresOperators :: Int -> [String]
|
||||||
|
> someValidPostgresOperators l =
|
||||||
|
> [ x
|
||||||
|
> | n <- [1..l]
|
||||||
|
> , x <- combos "+-*/<>=~!@#%^&|`?" n
|
||||||
|
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
|
||||||
|
> , not (last x `elem` "+-")
|
||||||
|
> || or (map (`elem` x) "~!@#%^&|`?")
|
||||||
|
> ]
|
||||||
|
|
||||||
|
These are postgres operators, which if followed immediately by a + or
|
||||||
|
-, will lex as separate operators rather than one operator including
|
||||||
|
the + or -.
|
||||||
|
|
||||||
|
> somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String]
|
||||||
|
> somePostgresOpsWhichWontAddTrailingPlusMinus l =
|
||||||
|
> [ x
|
||||||
|
> | n <- [1..l]
|
||||||
|
> , x <- combos "+-*/<>=" n
|
||||||
|
> , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
|
||||||
|
> , not (last x `elem` "+-")
|
||||||
|
> ]
|
||||||
|
|
||||||
|
|
||||||
> postgresLexerTests :: TestItem
|
> postgresLexerTests :: TestItem
|
||||||
> postgresLexerTests = Group "postgresLexerTests" $
|
> postgresLexerTests = Group "postgresLexerTests" $
|
||||||
> [Group "postgres lexer token tests" $ [LexerTest postgres s t | (s,t) <- postgresLexerTable]
|
> [Group "postgres lexer token tests" $
|
||||||
|
> [LexerTest postgres s t | (s,t) <- postgresLexerTable]
|
||||||
|
> ,Group "adhoc lexer tests" $
|
||||||
|
> [LexerTest postgres s t
|
||||||
|
> | (s,t) <- edgeCaseCommentOps ++ edgeCasePlusMinusOps ]
|
||||||
> ]
|
> ]
|
||||||
|
> where
|
||||||
|
> edgeCaseCommentOps = concat
|
||||||
|
> [ [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
|
||||||
|
> , (x ++ "--<test", [Symbol x, LineComment "--<test"]) ]
|
||||||
|
> | x <- someValidPostgresOperators 2
|
||||||
|
> , not (last x == '-')
|
||||||
|
> ]
|
||||||
|
> edgeCasePlusMinusOps = concat
|
||||||
|
> [ [ (x ++ "+", [Symbol x, Symbol "+"])
|
||||||
|
> , (x ++ "-", [Symbol x, Symbol "-"]) ]
|
||||||
|
> | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2
|
||||||
|
> ]
|
||||||
|
|
||||||
|
|
||||||
|
> combos :: [a] -> Int -> [[a]]
|
||||||
|
> combos _ 0 = [[]]
|
||||||
|
> combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]
|
||||||
|
|
Loading…
Reference in a new issue