From b0f1e044b48fbc85122cd89f41dc92748569ac2b Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Fri, 18 Apr 2014 14:50:54 +0300 Subject: [PATCH] refactor some of the multi keyword/ multiword typename parsing --- Language/SQL/SimpleSQL/Parser.lhs | 116 +++++++++++++------- Language/SQL/SimpleSQL/Syntax.lhs | 2 +- TODO | 32 ++++-- tools/Language/SQL/SimpleSQL/ValueExprs.lhs | 3 +- 4 files changed, 99 insertions(+), 54 deletions(-) diff --git a/Language/SQL/SimpleSQL/Parser.lhs b/Language/SQL/SimpleSQL/Parser.lhs index 2eef067..377c660 100644 --- a/Language/SQL/SimpleSQL/Parser.lhs +++ b/Language/SQL/SimpleSQL/Parser.lhs @@ -22,8 +22,8 @@ > import qualified Text.Parsec as P (ParseError) > import Text.Parsec.Perm (permute,(<$?>), (<|?>)) > import qualified Text.Parsec.Expr as E -> import Data.List (intercalate) - +> import Data.List (intercalate,sort,groupBy) +> import Data.Function (on) > import Language.SQL.SimpleSQL.Syntax The public API functions. @@ -513,15 +513,13 @@ typename: used in casts. Special cases for the multi keyword typenames that SQL supports. > typeName :: Parser TypeName -> typeName = (choice (multiWordParsers -> ++ [TypeName <$> identifierBlacklist blacklist]) -> >>= optionSuffix precision -> ) "typename" +> typeName = +> (choice [multiWordParsers +> ,TypeName <$> identifierBlacklist blacklist] +> >>= optionSuffix precision +> ) "typename" > where -> multiWordParsers = -> flip map multiWordTypeNames -> $ \ks -> (TypeName . unwords) <$> try (keywords ks) -> multiWordTypeNames = map words +> multiWordParsers = (TypeName . unwords) <$> makeKeywordTree > ["double precision" > ,"character varying" > ,"char varying" @@ -559,6 +557,41 @@ todo: timestamp types: > ctor [a] = Parens a > ctor as = SpecialOp [Name "rowctor"] as +== multi keyword helper + +This helper is to help parsing multiple options of multiple keywords +with similar prefixes, e.g. parsing 'is null' and 'is not null'. + +use to left factor/ improve: +typed literal and general identifiers +not like, not in, not between operators +help with factoring keyword functions and other app-likes +the join keyword sequences +fetch first/next +row/rows only + +There is probably a simpler way of doing this but I am a bit +thick. + +> makeKeywordTree :: [String] -> Parser [String] +> makeKeywordTree sets = do +> reverse <$> parseTrees (sort $ map words sets) +> -- ?? intercalate "," sets +> where +> parseTrees :: [[String]] -> Parser [String] +> parseTrees ws = do +> let gs :: [[[String]]] +> gs = groupBy ((==) `on` head) ws +> choice $ map parseGroup gs +> parseGroup :: [[String]] -> Parser [String] +> parseGroup l = do +> let k = head $ head l +> keyword_ k +> let tls = map tail l +> pr = (k:) <$> parseTrees tls +> if (or $ map null tls) +> then pr <|> return [k] +> else pr == operator parsing @@ -569,6 +602,12 @@ can be symbols (a + b), single keywords (a and b) or multiple keywords TODO: carefully review the precedences and associativities. +TODO: to fix the parsing completely, I think will need to parse +without precedence and associativity and fix up afterwards, since SQL +syntax is way too messy. It might be possible to avoid this if we +wanted to avoid extensibility and to not be concerned with parse error +messages, but both of these are considered too important. + > opTable :: Bool -> [[E.Operator String () Identity ValueExpr]] > opTable bExpr = > [-- parse match and quantified comparisons as postfix ops @@ -598,21 +637,22 @@ TODO: carefully review the precedences and associativities. > ,binarySym "|" E.AssocRight > ,binaryKeyword "like" E.AssocNone > ,binaryKeyword "overlaps" E.AssocNone] -> ++ map (`binaryKeywords` E.AssocNone) -> ["not like" -> ,"is similar to" -> ,"is not similar to" -> ,"is distinct from" -> ,"is not distinct from"] -> ++ map postfixKeywords -> ["is null" -> ,"is not null" -> ,"is true" -> ,"is not true" -> ,"is false" -> ,"is not false" -> ,"is unknown" -> ,"is not unknown"] +> ++ [binaryKeywords $ makeKeywordTree +> ["not like" +> ,"is similar to" +> ,"is not similar to" +> ,"is distinct from" +> ,"is not distinct from"] +> ,postfixKeywords $ makeKeywordTree +> ["is null" +> ,"is not null" +> ,"is true" +> ,"is not true" +> ,"is false" +> ,"is not false" +> ,"is unknown" +> ,"is not unknown"] +> ] > -- have to use try with inSuffix because of a conflict > -- with 'in' in position function, and not between > -- between also has a try in it to deal with 'not' @@ -631,17 +671,20 @@ TODO: carefully review the precedences and associativities. > where > binarySym nm assoc = binary (symbol_ nm) nm assoc > binaryKeyword nm assoc = binary (keyword_ nm) nm assoc -> -- use try with the multi keywords because of shared -> -- prefixes to the sets of keywords. Should left factor -> -- somehow -> binaryKeywords nm assoc = binary (try $ keywords_ (words nm)) nm assoc +> binaryKeywords p = +> E.Infix (do +> o <- p +> return (\a b -> BinOp a [Name $ unwords o] b)) +> E.AssocNone +> postfixKeywords p = +> postfix' $ do +> o <- p +> return $ PostfixOp [Name $ unwords o] > binary p nm assoc = > E.Infix (p >> return (\a b -> BinOp a [Name nm] b)) assoc > prefixKeyword nm = prefix (keyword_ nm) nm > prefixSym nm = prefix (symbol_ nm) nm > prefix p nm = prefix' (p >> return (PrefixOp [Name nm])) -> postfixKeywords nm = postfix (try $ keywords_ (words nm)) nm -> postfix p nm = postfix' (p >> return (PostfixOp [Name nm])) > -- hack from here > -- http://stackoverflow.com/questions/10475337/parsec-expr-repeated-prefix-postfix-operator-not-supported > -- not implemented properly yet @@ -815,11 +858,9 @@ allows offset and fetch in either order > fetch :: Parser ValueExpr > fetch = choice [ansiFetch, limit] > where --todo: better left factoring -> ansiFetch = keyword_ "fetch" >> -> choice [keyword_ "first",keyword_ "next"] -> *> valueExpr -> <* choice [keyword_ "rows",keyword_ "row"] -> <* keyword_ "only" +> fs = makeKeywordTree ["fetch first", "fetch next"] +> ro = makeKeywordTree ["rows only", "row only"] +> ansiFetch = fs *> valueExpr <* ro > limit = keyword_ "limit" *> valueExpr == common table expressions @@ -1070,9 +1111,6 @@ todo: work out the symbol parsing better helper function to improve error messages -> keywords :: [String] -> Parser [String] -> keywords ks = mapM keyword ks intercalate " " ks - > keywords_ :: [String] -> Parser () > keywords_ ks = mapM_ keyword_ ks intercalate " " ks diff --git a/Language/SQL/SimpleSQL/Syntax.lhs b/Language/SQL/SimpleSQL/Syntax.lhs index 1c26f3a..e035a01 100644 --- a/Language/SQL/SimpleSQL/Syntax.lhs +++ b/Language/SQL/SimpleSQL/Syntax.lhs @@ -139,7 +139,7 @@ > -- e.g. a[3]. The first > -- valueExpr is the array, the > -- second is the subscripts/ctor args -> | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t)> deriving (Eq,Show,Read,Data,Typeable) +> | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t) > | CSStringLit String String > | Escape ValueExpr Char > | UEscape ValueExpr Char diff --git a/TODO b/TODO index d62ffbb..6f5e86d 100644 --- a/TODO +++ b/TODO @@ -14,7 +14,8 @@ represent missing optional bits in the ast as nothing instead of the look at fixing the expression parsing completely represent natural and using/on in the syntax more close to the concrete syntax - don't combine in the ast - +review haddock in the syntax and update +review syntax names and representation careful review of token parses wrt trailing delimiters/junk review places in the parse which should allow only a fixed set of @@ -26,10 +27,12 @@ decide whether to represent numeric literals better, instead of a rough SQL 2003 todo, including tests to write: +switch TypedLit to CSStringLit based on first char being underscore? idens: "", unicode, charset?, check dotted idens and contexts add missing type name support: lots of missing ones here, including simple stuff like lob variations, and new things like interval, row, ref, scope, array, multiset type names. + add tests for all the typenames cast and typed literal date and time literals multisets review window functions, window clause @@ -43,13 +46,15 @@ filter in aggs within group in aggs rows review -support needed MODULE syntax in identifiers - already covered? -decide how to represent special identifiers including the session +LNR: maybe leave until after next release + +LNR support needed MODULE syntax in identifiers - already covered? +LNR decide how to represent special identifiers including the session variables or whatever they are called like current_user -grouping - needs special syntax? -next value for -probably leave for now: subtypes, methods, new /routine, dereference -double check associativity, precedence (value exprs, joins, set ops) +LNR grouping - needs special syntax? +LNR next value for +LNR subtypes, methods, new /routine, dereference +LNR double check associativity, precedence (value exprs, joins, set ops) position expressions length expressions extract expression @@ -61,7 +66,7 @@ convert translate trim overlay -specifictype +LNR specifictype datetime value expressions intervals row value constructors, expressions review @@ -80,21 +85,24 @@ between symmetric/asymmetric in predicate review escape for like escape for [not] similar to -regular expression syntax? +LNR regular expression syntax? normalized predicate overlaps predicate distinct from predicate member predicate submultiset predicate set predicate -type predicate +LNR type predicate additional stuff review: interval stuff aggregate functions: lots of missing bits especially: filter where, within group complete list of keywords/reserved keywords -select into -other language format identifiers for host params? +LNR select into +LNR other language format identifiers for host params? + +---- +above not marked LNR are for next release review areas where this parser is too permissive, e.g. value expressions allowed where column reference names only should be diff --git a/tools/Language/SQL/SimpleSQL/ValueExprs.lhs b/tools/Language/SQL/SimpleSQL/ValueExprs.lhs index f974e1c..bd2feec 100644 --- a/tools/Language/SQL/SimpleSQL/ValueExprs.lhs +++ b/tools/Language/SQL/SimpleSQL/ValueExprs.lhs @@ -132,8 +132,7 @@ Tests for parsing value expressions > unaryOperators :: TestItem > unaryOperators = Group "unaryOperators" $ map (uncurry TestValueExpr) > [("not a", PrefixOp "not" $ Iden "a") -> -- I think this is a missing feature or bug in parsec buildExpressionParser -> --,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a") +> ,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a") > ,("+a", PrefixOp "+" $ Iden "a") > ,("-a", PrefixOp "-" $ Iden "a") > ]