refactor some of the multi keyword/ multiword typename parsing

2014-04-18 14:50:54 +03:00 · 2014-04-18 14:50:54 +03:00 · b0f1e044b4
parent 247c7a26b7
commit b0f1e044b4
4 changed files with 99 additions and 54 deletions
--- a/Language/SQL/SimpleSQL/Parser.lhs
+++ b/Language/SQL/SimpleSQL/Parser.lhs
@ -22,8 +22,8 @@
 > import qualified Text.Parsec as P (ParseError)
 > import Text.Parsec.Perm (permute,(<$?>), (<|?>))
 > import qualified Text.Parsec.Expr as E
-> import Data.List (intercalate)
+> import Data.List (intercalate,sort,groupBy)
-
+> import Data.Function (on)
 > import Language.SQL.SimpleSQL.Syntax
 The public API functions.
@ -513,15 +513,13 @@ typename: used in casts. Special cases for the multi keyword typenames
 that SQL supports.
 > typeName :: Parser TypeName
-> typeName = (choice (multiWordParsers
+> typeName =
->                    ++ [TypeName <$> identifierBlacklist blacklist])
+>   (choice [multiWordParsers
->             >>= optionSuffix precision
+>           ,TypeName <$> identifierBlacklist blacklist]
->            ) <?> "typename"
+>    >>= optionSuffix precision
 >   ) <?> "typename"
 >   where
->     multiWordParsers =
+>     multiWordParsers = (TypeName . unwords) <$> makeKeywordTree
 >         flip map multiWordTypeNames
 >         $ \ks -> (TypeName . unwords) <$> try (keywords ks)
 >     multiWordTypeNames = map words
 >         ["double precision"
 >         ,"character varying"
 >         ,"char varying"
@ -559,6 +557,41 @@ todo: timestamp types:
 >     ctor [a] = Parens a
 >     ctor as = SpecialOp [Name "rowctor"] as
 == multi keyword helper
 This helper parses a choice between several multi-keyword sequences
 that share prefixes, e.g. 'is null' and 'is not null'.
 use to left factor/ improve:
 typed literal and general identifiers
 not like, not in, not between operators
 help with factoring keyword functions and other app-likes
 the join keyword sequences
 fetch first/next
 row/rows only
 There is probably a simpler way of doing this but I am a bit
 thick.
 > -- | Build a left-factored parser from a list of space-separated
 > -- keyword sequences, e.g. @["is null", "is not null"]@.
 > --
 > -- The sequences are split with 'words', sorted, and grouped by their
 > -- first keyword, so a shared prefix such as @is@ is parsed only once
 > -- before branching on the remaining keywords.
 > --
 > -- The result is the list of keywords of the matched sequence, in
 > -- source order and spelled as they were given in @sets@ (not as they
 > -- appeared in the input), so @unwords@ of the result gives back the
 > -- original entry.
 > --
 > -- When one sequence is a strict prefix of another, the longer
 > -- alternatives are tried first and the shorter one is accepted if
 > -- they fail; per Parsec's '<|>' this fallback only applies when the
 > -- longer alternative fails without consuming input.
 > --
 > -- Entries that contain no words at all are ignored.
 > makeKeywordTree :: [String] -> Parser [String]
 > makeKeywordTree sets =
 >     -- parseGroup already builds the result front to back, so no
 >     -- reversal is needed here.
 >     parseTrees (sort $ filter (not . null) $ map words sets)
 >     --  ?? <?> intercalate "," sets
 >   where
 >     -- Parse one of several non-empty keyword sequences. The input
 >     -- must be sorted so that sequences with the same first keyword
 >     -- are adjacent. An empty input gives a parser that always fails.
 >     parseTrees :: [[String]] -> Parser [String]
 >     parseTrees ws =
 >       -- 'take 1' is a total stand-in for 'head' as the grouping key
 >       choice $ map parseGroup $ groupBy ((==) `on` take 1) ws
 >     -- Parse a group of sequences which all start with the same
 >     -- keyword: consume that keyword, then continue with the tails.
 >     parseGroup :: [[String]] -> Parser [String]
 >     parseGroup l@((k:_):_) = do
 >         keyword_ k
 >         let tls = map (drop 1) l
 >             -- sequences which still have keywords left to parse
 >             rest = filter (not . null) tls
 >             pr = (k:) <$> parseTrees rest
 >         -- an empty tail means some sequence ends at k, so stopping
 >         -- here is a valid result if no longer sequence matches
 >         if any null tls
 >           then pr <|> return [k]
 >           else pr
 >     -- unreachable: parseTrees only creates groups of non-empty
 >     -- sequences
 >     parseGroup _ = fail "makeKeywordTree: empty keyword sequence"
 == operator parsing
@ -569,6 +602,12 @@ can be symbols (a + b), single keywords (a and b) or multiple keywords
 TODO: carefully review the precedences and associativities.
 TODO: to fix the parsing completely, I think will need to parse
 without precedence and associativity and fix up afterwards, since SQL
 syntax is way too messy. It might be possible to avoid this if we
 wanted to avoid extensibility and to not be concerned with parse error
 messages, but both of these are considered too important.
 > opTable :: Bool -> [[E.Operator String () Identity ValueExpr]]
 > opTable bExpr =
 >         [-- parse match and quantified comparisons as postfix ops
@ -598,21 +637,22 @@ TODO: carefully review the precedences and associativities.
 >          ,binarySym "|" E.AssocRight
 >          ,binaryKeyword "like" E.AssocNone
 >          ,binaryKeyword "overlaps" E.AssocNone]
->          ++ map (`binaryKeywords` E.AssocNone)
+>          ++ [binaryKeywords $ makeKeywordTree
->          ["not like"
+>              ["not like"
->          ,"is similar to"
+>              ,"is similar to"
->          ,"is not similar to"
+>              ,"is not similar to"
->          ,"is distinct from"
+>              ,"is distinct from"
->          ,"is not distinct from"]
+>              ,"is not distinct from"]
->          ++ map postfixKeywords
+>             ,postfixKeywords $ makeKeywordTree
->          ["is null"
+>              ["is null"
->          ,"is not null"
+>              ,"is not null"
->          ,"is true"
+>              ,"is true"
->          ,"is not true"
+>              ,"is not true"
->          ,"is false"
+>              ,"is false"
->          ,"is not false"
+>              ,"is not false"
->          ,"is unknown"
+>              ,"is unknown"
->          ,"is not unknown"]
+>              ,"is not unknown"]
 >             ]
 >          -- have to use try with inSuffix because of a conflict
 >          -- with 'in' in position function, and not between
 >          -- between also has a try in it to deal with 'not'
@ -631,17 +671,20 @@ TODO: carefully review the precedences and associativities.
 >   where
 >     binarySym nm assoc = binary (symbol_ nm) nm assoc
 >     binaryKeyword nm assoc = binary (keyword_ nm) nm assoc
->     -- use try with the multi keywords because of shared
+>     binaryKeywords p =
->     -- prefixes to the sets of keywords. Should left factor
+>         E.Infix (do
->     -- somehow
+>             o <- p
->     binaryKeywords nm assoc = binary (try $ keywords_ (words nm)) nm assoc
+>             return (\a b -> BinOp a [Name $ unwords o] b))
 >             E.AssocNone
 >     postfixKeywords p =
 >       postfix' $ do
 >           o <- p
 >           return $ PostfixOp [Name $ unwords o]
 >     binary p nm assoc =
 >       E.Infix (p >> return (\a b -> BinOp a [Name nm] b)) assoc
 >     prefixKeyword nm = prefix (keyword_ nm) nm
 >     prefixSym nm = prefix (symbol_ nm) nm
 >     prefix p nm = prefix' (p >> return (PrefixOp [Name nm]))
 >     postfixKeywords nm = postfix (try $ keywords_ (words nm)) nm
 >     postfix p nm = postfix' (p >> return (PostfixOp [Name nm]))
 >     -- hack from here
 >     -- http://stackoverflow.com/questions/10475337/parsec-expr-repeated-prefix-postfix-operator-not-supported
 >     -- not implemented properly yet
@ -815,11 +858,9 @@ allows offset and fetch in either order
 > fetch :: Parser ValueExpr
 > fetch = choice [ansiFetch, limit]
 >   where --todo: better left factoring
->     ansiFetch = keyword_ "fetch" >>
+>     fs = makeKeywordTree ["fetch first", "fetch next"]
->         choice [keyword_ "first",keyword_ "next"]
+>     ro = makeKeywordTree ["rows only", "row only"]
->         *> valueExpr
+>     ansiFetch = fs *> valueExpr <* ro
 >         <* choice [keyword_ "rows",keyword_ "row"]
 >         <* keyword_ "only"
 >     limit = keyword_ "limit" *> valueExpr
 == common table expressions
@ -1070,9 +1111,6 @@ todo: work out the symbol parsing better
 helper function to improve error messages
 > -- | Run 'keyword' on each element of @ks@ in order and collect the
 > -- results; the parser is labelled with the keywords joined by
 > -- single spaces.
 > keywords :: [String] -> Parser [String]
 > keywords ks = mapM keyword ks <?> unwords ks
 > -- | Run 'keyword_' on each element of @ks@ in order, discarding the
 > -- results; the parser is labelled with the keywords joined by
 > -- single spaces.
 > keywords_ :: [String] -> Parser ()
 > keywords_ ks = mapM_ keyword_ ks <?> unwords ks
--- a/Language/SQL/SimpleSQL/Syntax.lhs
+++ b/Language/SQL/SimpleSQL/Syntax.lhs
@ -139,7 +139,7 @@
 >                                   -- e.g. a[3]. The first
 >                                   -- valueExpr is the array, the
 >                                   -- second is the subscripts/ctor args
->     | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t)>       deriving (Eq,Show,Read,Data,Typeable)
+>     | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t)
 >     | CSStringLit String String
 >     | Escape ValueExpr Char
 >     | UEscape ValueExpr Char
--- a/32
+++ b/32
@ -14,7 +14,8 @@ represent missing optional bits in the ast as nothing instead of the
 look at fixing the expression parsing completely
 represent natural and using/on in the syntax more close to the
   concrete syntax - don't combine in the ast
-
+review haddock in the syntax and update
 review syntax names and representation
 careful review of token parses wrt trailing delimiters/junk
 review places in the parse which should allow only a fixed set of
@ -26,10 +27,12 @@ decide whether to represent numeric literals better, instead of a
 rough SQL 2003 todo, including tests to write:
 switch TypedLit to CSStringLit based on first char being underscore?
 idens: "", unicode, charset?, check dotted idens and contexts
 add missing type name support: lots of missing ones here, including
   simple stuff like lob variations, and new things like interval,
   row, ref, scope, array, multiset type names.
  add tests for all the typenames cast and typed literal
 date and time literals
 multisets
 review window functions, window clause
@ -43,13 +46,15 @@ filter in aggs
 within group in aggs
 rows review
-support needed MODULE syntax in identifiers - already covered?
+LNR: maybe leave until after next release
-decide how to represent special identifiers including the session
+
 LNR support needed MODULE syntax in identifiers - already covered?
 LNR decide how to represent special identifiers including the session
   variables or whatever they are called like current_user
-grouping - needs special syntax?
+LNR grouping - needs special syntax?
-next value for
+LNR next value for
-probably leave for now: subtypes, methods, new /routine, dereference
+LNR subtypes, methods, new /routine, dereference
-double check associativity, precedence (value exprs, joins, set ops)
+LNR double check associativity, precedence (value exprs, joins, set ops)
 position expressions
 length expressions
 extract expression
@ -61,7 +66,7 @@ convert
 translate
 trim
 overlay
-specifictype
+LNR specifictype
 datetime value expressions
 intervals
 row value constructors, expressions review
@ -80,21 +85,24 @@ between symmetric/asymmetric
 in predicate review
 escape for like
 escape for [not] similar to
-regular expression syntax?
+LNR regular expression syntax?
 normalized predicate
 overlaps predicate
 distinct from predicate
 member predicate
 submultiset predicate
 set predicate
-type predicate
+LNR type predicate
 additional stuff review:
 interval stuff
 aggregate functions: lots of missing bits
  especially: filter where, within group
 complete list of keywords/reserved keywords
-select into
+LNR select into
-other language format identifiers for host params?
+LNR other language format identifiers for host params?
 ----
 items above not marked LNR are for the next release
 review areas where this parser is too permissive, e.g. value
   expressions allowed where column reference names only should be
--- a/tools/Language/SQL/SimpleSQL/ValueExprs.lhs
+++ b/tools/Language/SQL/SimpleSQL/ValueExprs.lhs
@ -132,8 +132,7 @@ Tests for parsing value expressions
 > unaryOperators :: TestItem
 > unaryOperators = Group "unaryOperators" $ map (uncurry TestValueExpr)
 >     [("not a", PrefixOp "not" $ Iden "a")
->      -- I think this is a missing feature or bug in parsec buildExpressionParser
+>     ,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a")
 >     --,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a")
 >     ,("+a", PrefixOp "+" $ Iden "a")
 >     ,("-a", PrefixOp "-" $ Iden "a")
 >     ]