From c24008444c8c2895c162c98a145afd8fa23b3fd6 Mon Sep 17 00:00:00 2001
From: Jake Wheat <jakewheatmail@gmail.com>
Date: Sat, 13 Feb 2016 20:28:12 +0200
Subject: [PATCH] work on postgres operator parsing

---
 Language/SQL/SimpleSQL/Lex.lhs              | 111 +++++++++++++++++---
 tools/Language/SQL/SimpleSQL/LexerTests.lhs |  78 +++++++++++++-
 2 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/Language/SQL/SimpleSQL/Lex.lhs b/Language/SQL/SimpleSQL/Lex.lhs
index 238a889..3a2825a 100644
--- a/Language/SQL/SimpleSQL/Lex.lhs
+++ b/Language/SQL/SimpleSQL/Lex.lhs
@@ -54,7 +54,8 @@ todo: public documentation on dialect definition - and dialect flags
 >                    ,setPosition,getPosition
 >                    ,setSourceColumn,setSourceLine
 >                    ,sourceName, setSourceName
->                    ,sourceLine, sourceColumn)
+>                    ,sourceLine, sourceColumn
+>                    ,notFollowedBy)
 > import Language.SQL.SimpleSQL.Combinators
 > import Language.SQL.SimpleSQL.Errors
 > import Control.Applicative hiding ((<|>), many)
@@ -329,21 +330,101 @@ character symbols in the two lists below.
 
 > symbol :: Dialect -> Parser Token
 > symbol d | diSyntaxFlavour d == Postgres =
->     Symbol <$>
->     choice (
->             many1 (char '.') :
->                 map (try . string) [">=","<=","!=","<>","||", "::", ":="]
->                 ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;():")
+>     Symbol <$> choice (otherSymbol ++ [singlePlusMinus,opMoreChars])
+
+rules
+
+An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list:
+
++ - * / < > = ~ ! @ # % ^ & | ` ?
+
+There are a few restrictions on operator names, however:
+-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
+
+A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters:
+
+~ ! @ # % ^ & | ` ?
+
+>  where
+>    -- other symbols are all the tokens which parse as symbols in
+>    -- this lexer which aren't considered operators in postgresql
+>    -- a single ? is parsed as a operator here instead of an other
+>    -- symbol because this is the least complex way to do it
+>    otherSymbol = many1 (char '.') :
+>                  (map (try . string) ["::", ":="]
+>                   ++ map (string . (:[])) "[],;():")
+
+exception char is one of:
+~ ! @ # % ^ & | ` ?
+which allows the last character of a multi character symbol to be + or
+-
+
+>    allOpSymbols = "+-*/<>=~!@#%^&|`?"
+>    -- all symbols except - and / which can be used to start
+>    -- a comment token
+>    allOpSymbolsNoCommentStarters = filter (`notElem` "-/") allOpSymbols
+>    -- these are the symbols when if part of a multi character
+>    -- operator permit the operator to end with a + or - symbol
+>    exceptionOpSymbols = "~!@#%^&|`?"
+> 
+>    -- special case for parsing a single + or - symbol
+>    singlePlusMinus = try $ do
+>      c <- choice $ map char "+-"
+>      -- todo: deal with e.g. --- +-- +/* ?
+>      notFollowedBy $ choice $ map char allOpSymbols
+>      return [c]
+
+>    -- this is used when we are parsing a potentially multi symbol
+>    -- operator and we have alread seen one of the 'exception chars'
+>    -- and so we can end with a + or -
+>    moreOpCharsException = do
+>        c <- choice (map char allOpSymbolsNoCommentStarters
+>                     -- make sure we don't parse a comment starting token
+>                     -- as part of an operator
+>                     ++ [try (char '/' <* notFollowedBy (char '*'))
+>                        ,try (char '-' <* notFollowedBy (char '-'))])
+>        (c:) <$> option [] moreOpCharsException
+
+>    opMoreChars = choice
+>        [do
+>         -- parse an exception char, now we can finish with a + -
+>         c <- choice $ map char exceptionOpSymbols
+>         (c:) <$> option [] moreOpCharsException
+>        ,do
+>         -- parse + or -, make sure it isn't the last symbol
+>         c <- try (char '+'
+>                   -- make sure there is another symbol
+>                   <* lookAhead (choice $ map char allOpSymbols))
+>         (c:) <$> option [] opMoreChars
+>        ,do
+>         c <- try (char '-'
+>                   -- check for comment
+>                   <* notFollowedBy (char '-')
+>                   -- make sure there is another symbol
+>                   <* lookAhead (choice $ map char allOpSymbols))
+>         (c:) <$> option [] opMoreChars
+>        ,do
+>         -- parse one of the other ansi operator symbols
+>         c <- choice (-- check / isn't start of comment /*
+>                      try (char '/' <* notFollowedBy (char '*'))
+>                      : map char "*<>=")
+>         (c:) <$> option [] opMoreChars
+>        ]
+
+
+> symbol _ =
+>    Symbol <$> choice (otherSymbol ++ regularOp)
+>  where
+>    otherSymbol = many1 (char '.') :
+>                  map (string . (:[])) "[],;():?"
+
+try is used because most of the first characters of the two character
+symbols can also be part of a single character symbol
+
+>    regularOp = map (try . string) [">=","<=","!=","<>","||"]
+>                ++ map (string . (:[])) "+-^*/%~&|<>="
+
 
-> symbol _ = Symbol <$>
->     choice (
->             many1 (char '.') :
->                  -- try is used because most of the first
->                  -- characters of the two character symbols
->                  -- can also be part of a single character symbol
->                  -- maybe this would be better with left factoring?
->                 map (try . string) [">=","<=","!=","<>","||"]
->                 ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;()")
 
 > sqlWhitespace :: Dialect -> Parser Token
 > sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.lhs b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
index 7e7e76b..c8a21ae 100644
--- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
@@ -8,6 +8,7 @@ Test for the lexer
 > import Language.SQL.SimpleSQL.Lex (Token(..))
 > --import Debug.Trace
 > import Data.Char (isAlpha)
+> import Data.List
 
 > lexerTests :: TestItem
 > lexerTests = Group "lexerTests" $
@@ -78,6 +79,12 @@ csstring string
 line comment anything (can add newline?)
 number number (todo: double check more carefully)
 
+new idea:
+
+create a function which will say if a series of lexical tokens should
+survive the pretty -> lex cycle unchanged. Maybe this will lead to
+clearer/more maintainable code.
+
 >     , isGood $ t ++ t1
 
 >     ]
@@ -174,8 +181,28 @@ assurance.
 >     map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():"
 >     -- multi char symbols
 >     ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="]
->     -- todo: add many examples of generic symbols
->     -- also: do the testing for the ansi compatibility special cases
+>     -- generic symbols
+
+An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list:
+
++ - * / < > = ~ ! @ # % ^ & | ` ?
+
+There are a few restrictions on operator names, however:
+-- and /* cannot appear anywhere in an operator name, since they will be taken as the start of a comment.
+
+A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters:
+
+~ ! @ # % ^ & | ` ?
+
+todo: 'negative' tests
+symbol then --
+symbol then /*
+operators without one of the exception chars
+  followed by + or - without whitespace
+
+also: do the testing for the ansi compatibility special cases
+
+>     ++ [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]
 >     ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
 >         -- simple identifiers
 >         in map (\i -> (i, [Identifier Nothing i])) idens
@@ -227,7 +254,52 @@ assurance.
 >        ,"/* this *is/ a comment */"
 >        ]
 
+> someValidPostgresOperators :: Int -> [String]
+> someValidPostgresOperators l =
+>        [ x
+>        | n <- [1..l]
+>        , x <- combos "+-*/<>=~!@#%^&|`?" n
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not (last x `elem` "+-")
+>          || or (map (`elem` x) "~!@#%^&|`?")
+>        ]
+
+These are postgres operators, which if followed immediately by a + or
+-, will lex as separate operators rather than one operator including
+the + or -.
+
+> somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String]
+> somePostgresOpsWhichWontAddTrailingPlusMinus l =
+>        [ x
+>        | n <- [1..l]
+>        , x <- combos "+-*/<>=" n
+>        , not ("--" `isInfixOf` x || "/*" `isInfixOf` x)
+>        , not (last x `elem` "+-")
+>        ]
+
+
 > postgresLexerTests :: TestItem
 > postgresLexerTests = Group "postgresLexerTests" $
->     [Group "postgres lexer token tests" $ [LexerTest postgres s t |  (s,t) <- postgresLexerTable]
+>     [Group "postgres lexer token tests" $
+>      [LexerTest postgres s t | (s,t) <- postgresLexerTable]
+>     ,Group "adhoc lexer tests" $
+>      [LexerTest postgres s t
+>      | (s,t) <- edgeCaseCommentOps ++ edgeCasePlusMinusOps ]
 >     ]
+>  where
+>    edgeCaseCommentOps = concat
+>      [ [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
+>        , (x ++ "--<test", [Symbol x, LineComment "--<test"]) ]
+>      | x <- someValidPostgresOperators 2
+>      , not (last x == '-')
+>      ]
+>    edgeCasePlusMinusOps = concat
+>      [ [ (x ++ "+", [Symbol x, Symbol "+"])
+>        , (x ++ "-", [Symbol x, Symbol "-"]) ]
+>      | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2
+>      ]
+
+
+> combos :: [a] -> Int -> [[a]]
+> combos _ 0 = [[]]
+> combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]