1
Fork 0

refactor some of the multi keyword/ multiword typename parsing

This commit is contained in:
Jake Wheat 2014-04-18 14:50:54 +03:00
parent 247c7a26b7
commit b0f1e044b4
4 changed files with 99 additions and 54 deletions

View file

@ -22,8 +22,8 @@
> import qualified Text.Parsec as P (ParseError) > import qualified Text.Parsec as P (ParseError)
> import Text.Parsec.Perm (permute,(<$?>), (<|?>)) > import Text.Parsec.Perm (permute,(<$?>), (<|?>))
> import qualified Text.Parsec.Expr as E > import qualified Text.Parsec.Expr as E
> import Data.List (intercalate) > import Data.List (intercalate,sort,groupBy)
> import Data.Function (on)
> import Language.SQL.SimpleSQL.Syntax > import Language.SQL.SimpleSQL.Syntax
The public API functions. The public API functions.
@ -513,15 +513,13 @@ typename: used in casts. Special cases for the multi keyword typenames
that SQL supports. that SQL supports.
> typeName :: Parser TypeName > typeName :: Parser TypeName
> typeName = (choice (multiWordParsers > typeName =
> ++ [TypeName <$> identifierBlacklist blacklist]) > (choice [multiWordParsers
> >>= optionSuffix precision > ,TypeName <$> identifierBlacklist blacklist]
> ) <?> "typename" > >>= optionSuffix precision
> ) <?> "typename"
> where > where
> multiWordParsers = > multiWordParsers = (TypeName . unwords) <$> makeKeywordTree
> flip map multiWordTypeNames
> $ \ks -> (TypeName . unwords) <$> try (keywords ks)
> multiWordTypeNames = map words
> ["double precision" > ["double precision"
> ,"character varying" > ,"character varying"
> ,"char varying" > ,"char varying"
@ -559,6 +557,41 @@ todo: timestamp types:
> ctor [a] = Parens a > ctor [a] = Parens a
> ctor as = SpecialOp [Name "rowctor"] as > ctor as = SpecialOp [Name "rowctor"] as
== multi keyword helper
This helper is to help parsing multiple options of multiple keywords
with similar prefixes, e.g. parsing 'is null' and 'is not null'.
use to left factor/ improve:
typed literal and general identifiers
not like, not in, not between operators
help with factoring keyword functions and other app-likes
the join keyword sequences
fetch first/next
row/rows only
There is probably a simpler way of doing this but I am a bit
thick.
> makeKeywordTree :: [String] -> Parser [String]
> makeKeywordTree sets = do
> reverse <$> parseTrees (sort $ map words sets)
> -- ?? <?> intercalate "," sets
> where
> parseTrees :: [[String]] -> Parser [String]
> parseTrees ws = do
> let gs :: [[[String]]]
> gs = groupBy ((==) `on` head) ws
> choice $ map parseGroup gs
> parseGroup :: [[String]] -> Parser [String]
> parseGroup l = do
> let k = head $ head l
> keyword_ k
> let tls = map tail l
> pr = (k:) <$> parseTrees tls
> if (or $ map null tls)
> then pr <|> return [k]
> else pr
== operator parsing == operator parsing
@ -569,6 +602,12 @@ can be symbols (a + b), single keywords (a and b) or multiple keywords
TODO: carefully review the precedences and associativities. TODO: carefully review the precedences and associativities.
TODO: to fix the parsing completely, I think will need to parse
without precedence and associativity and fix up afterwards, since SQL
syntax is way too messy. It might be possible to avoid this if we
wanted to avoid extensibility and to not be concerned with parse error
messages, but both of these are considered too important.
> opTable :: Bool -> [[E.Operator String () Identity ValueExpr]] > opTable :: Bool -> [[E.Operator String () Identity ValueExpr]]
> opTable bExpr = > opTable bExpr =
> [-- parse match and quantified comparisons as postfix ops > [-- parse match and quantified comparisons as postfix ops
@ -598,21 +637,22 @@ TODO: carefully review the precedences and associativities.
> ,binarySym "|" E.AssocRight > ,binarySym "|" E.AssocRight
> ,binaryKeyword "like" E.AssocNone > ,binaryKeyword "like" E.AssocNone
> ,binaryKeyword "overlaps" E.AssocNone] > ,binaryKeyword "overlaps" E.AssocNone]
> ++ map (`binaryKeywords` E.AssocNone) > ++ [binaryKeywords $ makeKeywordTree
> ["not like" > ["not like"
> ,"is similar to" > ,"is similar to"
> ,"is not similar to" > ,"is not similar to"
> ,"is distinct from" > ,"is distinct from"
> ,"is not distinct from"] > ,"is not distinct from"]
> ++ map postfixKeywords > ,postfixKeywords $ makeKeywordTree
> ["is null" > ["is null"
> ,"is not null" > ,"is not null"
> ,"is true" > ,"is true"
> ,"is not true" > ,"is not true"
> ,"is false" > ,"is false"
> ,"is not false" > ,"is not false"
> ,"is unknown" > ,"is unknown"
> ,"is not unknown"] > ,"is not unknown"]
> ]
> -- have to use try with inSuffix because of a conflict > -- have to use try with inSuffix because of a conflict
> -- with 'in' in position function, and not between > -- with 'in' in position function, and not between
> -- between also has a try in it to deal with 'not' > -- between also has a try in it to deal with 'not'
@ -631,17 +671,20 @@ TODO: carefully review the precedences and associativities.
> where > where
> binarySym nm assoc = binary (symbol_ nm) nm assoc > binarySym nm assoc = binary (symbol_ nm) nm assoc
> binaryKeyword nm assoc = binary (keyword_ nm) nm assoc > binaryKeyword nm assoc = binary (keyword_ nm) nm assoc
> -- use try with the multi keywords because of shared > binaryKeywords p =
> -- prefixes to the sets of keywords. Should left factor > E.Infix (do
> -- somehow > o <- p
> binaryKeywords nm assoc = binary (try $ keywords_ (words nm)) nm assoc > return (\a b -> BinOp a [Name $ unwords o] b))
> E.AssocNone
> postfixKeywords p =
> postfix' $ do
> o <- p
> return $ PostfixOp [Name $ unwords o]
> binary p nm assoc = > binary p nm assoc =
> E.Infix (p >> return (\a b -> BinOp a [Name nm] b)) assoc > E.Infix (p >> return (\a b -> BinOp a [Name nm] b)) assoc
> prefixKeyword nm = prefix (keyword_ nm) nm > prefixKeyword nm = prefix (keyword_ nm) nm
> prefixSym nm = prefix (symbol_ nm) nm > prefixSym nm = prefix (symbol_ nm) nm
> prefix p nm = prefix' (p >> return (PrefixOp [Name nm])) > prefix p nm = prefix' (p >> return (PrefixOp [Name nm]))
> postfixKeywords nm = postfix (try $ keywords_ (words nm)) nm
> postfix p nm = postfix' (p >> return (PostfixOp [Name nm]))
> -- hack from here > -- hack from here
> -- http://stackoverflow.com/questions/10475337/parsec-expr-repeated-prefix-postfix-operator-not-supported > -- http://stackoverflow.com/questions/10475337/parsec-expr-repeated-prefix-postfix-operator-not-supported
> -- not implemented properly yet > -- not implemented properly yet
@ -815,11 +858,9 @@ allows offset and fetch in either order
> fetch :: Parser ValueExpr > fetch :: Parser ValueExpr
> fetch = choice [ansiFetch, limit] > fetch = choice [ansiFetch, limit]
> where --todo: better left factoring > where --todo: better left factoring
> ansiFetch = keyword_ "fetch" >> > fs = makeKeywordTree ["fetch first", "fetch next"]
> choice [keyword_ "first",keyword_ "next"] > ro = makeKeywordTree ["rows only", "row only"]
> *> valueExpr > ansiFetch = fs *> valueExpr <* ro
> <* choice [keyword_ "rows",keyword_ "row"]
> <* keyword_ "only"
> limit = keyword_ "limit" *> valueExpr > limit = keyword_ "limit" *> valueExpr
== common table expressions == common table expressions
@ -1070,9 +1111,6 @@ todo: work out the symbol parsing better
helper function to improve error messages helper function to improve error messages
> keywords :: [String] -> Parser [String]
> keywords ks = mapM keyword ks <?> intercalate " " ks
> keywords_ :: [String] -> Parser () > keywords_ :: [String] -> Parser ()
> keywords_ ks = mapM_ keyword_ ks <?> intercalate " " ks > keywords_ ks = mapM_ keyword_ ks <?> intercalate " " ks

View file

@ -139,7 +139,7 @@
> -- e.g. a[3]. The first > -- e.g. a[3]. The first
> -- valueExpr is the array, the > -- valueExpr is the array, the
> -- second is the subscripts/ctor args > -- second is the subscripts/ctor args
> | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t)> deriving (Eq,Show,Read,Data,Typeable) > | ArrayCtor QueryExpr -- ^ this is used for the query expression version of array constructors, e.g. array(select * from t)
> | CSStringLit String String > | CSStringLit String String
> | Escape ValueExpr Char > | Escape ValueExpr Char
> | UEscape ValueExpr Char > | UEscape ValueExpr Char

32
TODO
View file

@ -14,7 +14,8 @@ represent missing optional bits in the ast as nothing instead of the
look at fixing the expression parsing completely look at fixing the expression parsing completely
represent natural and using/on in the syntax more close to the represent natural and using/on in the syntax more close to the
concrete syntax - don't combine in the ast concrete syntax - don't combine in the ast
review haddock in the syntax and update
review syntax names and representation
careful review of token parses wrt trailing delimiters/junk careful review of token parses wrt trailing delimiters/junk
review places in the parse which should allow only a fixed set of review places in the parse which should allow only a fixed set of
@ -26,10 +27,12 @@ decide whether to represent numeric literals better, instead of a
rough SQL 2003 todo, including tests to write: rough SQL 2003 todo, including tests to write:
switch TypedLit to CSStringLit based on first char being underscore?
idens: "", unicode, charset?, check dotted idens and contexts idens: "", unicode, charset?, check dotted idens and contexts
add missing type name support: lots of missing ones here, including add missing type name support: lots of missing ones here, including
simple stuff like lob variations, and new things like interval, simple stuff like lob variations, and new things like interval,
row, ref, scope, array, multiset type names. row, ref, scope, array, multiset type names.
add tests for all the typenames cast and typed literal
date and time literals date and time literals
multisets multisets
review window functions, window clause review window functions, window clause
@ -43,13 +46,15 @@ filter in aggs
within group in aggs within group in aggs
rows review rows review
support needed MODULE syntax in identifiers - already covered? LNR: maybe leave until after next release
decide how to represent special identifiers including the session
LNR support needed MODULE syntax in identifiers - already covered?
LNR decide how to represent special identifiers including the session
variables or whatever they are called like current_user variables or whatever they are called like current_user
grouping - needs special syntax? LNR grouping - needs special syntax?
next value for LNR next value for
probably leave for now: subtypes, methods, new /routine, dereference LNR subtypes, methods, new /routine, dereference
double check associativity, precedence (value exprs, joins, set ops) LNR double check associativity, precedence (value exprs, joins, set ops)
position expressions position expressions
length expressions length expressions
extract expression extract expression
@ -61,7 +66,7 @@ convert
translate translate
trim trim
overlay overlay
specifictype LNR specifictype
datetime value expressions datetime value expressions
intervals intervals
row value constructors, expressions review row value constructors, expressions review
@ -80,21 +85,24 @@ between symmetric/asymmetric
in predicate review in predicate review
escape for like escape for like
escape for [not] similar to escape for [not] similar to
regular expression syntax? LNR regular expression syntax?
normalized predicate normalized predicate
overlaps predicate overlaps predicate
distinct from predicate distinct from predicate
member predicate member predicate
submultiset predicate submultiset predicate
set predicate set predicate
type predicate LNR type predicate
additional stuff review: additional stuff review:
interval stuff interval stuff
aggregate functions: lots of missing bits aggregate functions: lots of missing bits
especially: filter where, within group especially: filter where, within group
complete list of keywords/reserved keywords complete list of keywords/reserved keywords
select into LNR select into
other language format identifiers for host params? LNR other language format identifiers for host params?
----
above not marked LNR are for next release
review areas where this parser is too permissive, e.g. value review areas where this parser is too permissive, e.g. value
expressions allowed where column reference names only should be expressions allowed where column reference names only should be

View file

@ -132,8 +132,7 @@ Tests for parsing value expressions
> unaryOperators :: TestItem > unaryOperators :: TestItem
> unaryOperators = Group "unaryOperators" $ map (uncurry TestValueExpr) > unaryOperators = Group "unaryOperators" $ map (uncurry TestValueExpr)
> [("not a", PrefixOp "not" $ Iden "a") > [("not a", PrefixOp "not" $ Iden "a")
> -- I think this is a missing feature or bug in parsec buildExpressionParser > ,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a")
> --,("not not a", PrefixOp "not" $ PrefixOp "not" $ Iden "a")
> ,("+a", PrefixOp "+" $ Iden "a") > ,("+a", PrefixOp "+" $ Iden "a")
> ,("-a", PrefixOp "-" $ Iden "a") > ,("-a", PrefixOp "-" $ Iden "a")
> ] > ]