From ca910b8e6dd4d0eddcfbdf15c5fad51e80f1d33f Mon Sep 17 00:00:00 2001
From: Jake Wheat <jakewheatmail@gmail.com>
Date: Sat, 31 Aug 2019 16:15:17 +0100
Subject: [PATCH] tweaks to the keyword list and handling

---
 Language/SQL/SimpleSQL/Dialect.lhs | 202 +++++++++++++++--------------
 Language/SQL/SimpleSQL/Parse.lhs   |  81 ++----------
 2 files changed, 116 insertions(+), 167 deletions(-)

diff --git a/Language/SQL/SimpleSQL/Dialect.lhs b/Language/SQL/SimpleSQL/Dialect.lhs
index 17229cf..710abe7 100644
--- a/Language/SQL/SimpleSQL/Dialect.lhs
+++ b/Language/SQL/SimpleSQL/Dialect.lhs
@@ -92,6 +92,18 @@ Data types to represent different dialect options
 > addLimit d = d {diKeywords = "limit": diKeywords d
 >                ,diLimit = True}
 
+todo: review this list
+add tests
+
+think about how to decide whether something can safely be made a
+non-keyword (assuming a word is either always a keyword or never one)
+-> if something can't appear in a scalar expression or next to one,
+then I think it's pretty safe
+
+mostly, things are keywords to avoid them mistakenly being parsed as
+aliases or as identifiers/functions/function-like things (aggs,
+windows, etc.)
+
 > ansi2011ReservedKeywords :: [String]
 > ansi2011ReservedKeywords =
 >     [--"abs" -- function
@@ -108,78 +120,78 @@ Data types to represent different dialect options
 >     ,"asensitive" -- keyword
 >     ,"asymmetric" -- keyword
 >     ,"at" -- keyword
->     ,"atomic"
->     ,"authorization"
->     --,"avg"
->     ,"begin"
->     ,"begin_frame"
->     ,"begin_partition"
->     ,"between"
->     ,"bigint"
->     ,"binary"
->     ,"blob"
->     ,"boolean"
->     ,"both"
->     ,"by"
->     ,"call"
->     ,"called"
->     ,"cardinality"
->     ,"cascaded"
->     ,"case"
->     ,"cast"
->     ,"ceil"
->     ,"ceiling"
->     ,"char"
->     --,"char_length"
->     ,"character"
->     --,"character_length"
->     ,"check"
->     ,"clob"
->     ,"close"
->     ,"coalesce"
->     ,"collate"
->     --,"collect"
->     ,"column"
->     ,"commit"
->     ,"condition"
->     ,"connect"
->     ,"constraint"
->     ,"contains"
->     --,"convert"
->     --,"corr"
->     ,"corresponding"
->     --,"count"
->     --,"covar_pop"
->     --,"covar_samp"
->     ,"create"
->     ,"cross"
->     ,"cube"
->     --,"cume_dist"
->     ,"current"
->     ,"current_catalog"
->     --,"current_date"
->     --,"current_default_transform_group"
->     --,"current_path"
->     --,"current_role"
->     ,"current_row"
->     ,"current_schema"
->     ,"current_time"
->     --,"current_timestamp"
->     ,"current_transform_group_for_type"
->     --,"current_user"
->     ,"cursor"
->     ,"cycle"
->     ,"date"
->     --,"day"
->     ,"deallocate"
->     ,"dec"
->     ,"decimal"
->     ,"declare"
->     --,"default"
->     ,"delete"
->     --,"dense_rank"
->     ,"deref"
->     ,"describe"
+>     ,"atomic" -- keyword
+>     ,"authorization" -- keyword
+>     --,"avg" -- function
+>     ,"begin" -- keyword
+>     --,"begin_frame" -- identifier
+>     --,"begin_partition" -- identifier
+>     ,"between" -- keyword
+>     ,"bigint" -- type
+>     ,"binary" -- type
+>     ,"blob" -- type
+>     ,"boolean" -- type
+>     ,"both" -- keyword
+>     ,"by" -- keyword
+>     ,"call" -- keyword
+>     ,"called" -- keyword
+>     -- ,"cardinality" -- function + identifier?
+>     ,"cascaded" -- keyword
+>     ,"case" -- keyword
+>     ,"cast" -- special function
+>     -- ,"ceil" -- function
+>     -- ,"ceiling" -- function
+>     ,"char"  -- type (+ keyword?)
+>     --,"char_length" -- function
+>     ,"character" -- type
+>     --,"character_length" -- function
+>     ,"check" -- keyword
+>     ,"clob" -- type
+>     ,"close" -- keyword
+>     -- ,"coalesce" -- function
+>     ,"collate" -- keyword
+>     --,"collect" -- function
+>     ,"column" -- keyword
+>     ,"commit" -- keyword
+>     ,"condition" -- keyword
+>     ,"connect" -- keyword
+>     ,"constraint" --keyword
+>     --,"contains" -- keyword?
+>     --,"convert" -- function?
+>     --,"corr" -- function
+>     ,"corresponding" --keyword
+>     --,"count" --function
+>     --,"covar_pop" -- function
+>     --,"covar_samp" --function
+>     ,"create" -- keyword
+>     ,"cross" -- keyword
+>     ,"cube" -- keyword
+>     --,"cume_dist" -- function
+>     ,"current" -- keyword
+>     -- ,"current_catalog" --identifier?
+>     --,"current_date" -- identifier
+>     --,"current_default_transform_group"  -- identifier
+>     --,"current_path"  -- identifier
+>     --,"current_role"  -- identifier
+>     -- ,"current_row"  -- identifier
+>     -- ,"current_schema"  -- identifier
+>     -- ,"current_time"  -- identifier
+>     --,"current_timestamp"  -- identifier
+>     --,"current_transform_group_for_type"  -- identifier, or keyword?
+>     --,"current_user" -- identifier
+>     ,"cursor" -- keyword
+>     ,"cycle" --keyword
+>     ,"date" -- type
+>     ,"day" -- keyword?
+>     ,"deallocate" -- keyword
+>     ,"dec" -- type
+>     ,"decimal" -- type
+>     ,"declare" -- keyword
+>     --,"default" -- identifier + keyword
+>     ,"delete" -- keyword
+>     --,"dense_rank" -- function
+>     ,"deref" -- keyword
+>     ,"describe"  -- keyword
 >     ,"deterministic"
 >     ,"disconnect"
 >     ,"distinct"
@@ -190,9 +202,9 @@ Data types to represent different dialect options
 >     --,"element"
 >     ,"else"
 >     ,"end"
->     ,"end_frame"
->     ,"end_partition"
->     ,"end-exec"
+>     -- ,"end_frame"  -- identifier
+>     -- ,"end_partition"  -- identifier
+>     ,"end-exec" -- terminator for embedded SQL (EXEC SQL ... END-EXEC)
 >     ,"equals"
 >     ,"escape"
 >     --,"every"
@@ -206,12 +218,12 @@ Data types to represent different dialect options
 >     --,"false"
 >     ,"fetch"
 >     ,"filter"
->     ,"first_value"
+>     -- ,"first_value"
 >     ,"float"
->     ,"floor"
+>     --,"floor"
 >     ,"for"
 >     ,"foreign"
->     ,"frame_row"
+>     -- ,"frame_row"  -- identifier
 >     ,"free"
 >     ,"from"
 >     ,"full"
@@ -225,7 +237,7 @@ Data types to represent different dialect options
 >     ,"groups"
 >     ,"having"
 >     ,"hold"
->     --,"hour"
+>     ,"hour"
 >     ,"identity"
 >     ,"in"
 >     ,"indicator"
@@ -241,21 +253,21 @@ Data types to represent different dialect options
 >     ,"into"
 >     ,"is"
 >     ,"join"
->     ,"lag"
+>     --,"lag"
 >     ,"language"
 >     ,"large"
->     ,"last_value"
+>     --,"last_value"
 >     ,"lateral"
->     ,"lead"
+>     --,"lead"
 >     ,"leading"
 >     ,"left"
 >     ,"like"
 >     ,"like_regex"
->     ,"ln"
+>     --,"ln"
 >     ,"local"
 >     ,"localtime"
 >     ,"localtimestamp"
->     ,"lower"
+>     --,"lower"
 >     ,"match"
 >     --,"max"
 >     ,"member"
@@ -263,7 +275,7 @@ Data types to represent different dialect options
 >     ,"method"
 >     --,"min"
 >     --,"minute"
->     ,"mod"
+>     --,"mod"
 >     ,"modifies"
 >     --,"module"
 >     --,"month"
@@ -277,10 +289,10 @@ Data types to represent different dialect options
 >     ,"none"
 >     ,"normalize"
 >     ,"not"
->     ,"nth_value"
+>     --,"nth_value"
 >     ,"ntile"
 >     --,"null"
->     ,"nullif"
+>     --,"nullif"
 >     ,"numeric"
 >     ,"octet_length"
 >     ,"occurrences_regex"
@@ -307,7 +319,7 @@ Data types to represent different dialect options
 >     ,"portion"
 >     ,"position"
 >     ,"position_regex"
->     ,"power"
+>     --,"power"
 >     ,"precedes"
 >     ,"precision"
 >     ,"prepare"
@@ -339,7 +351,7 @@ Data types to represent different dialect options
 >     ,"rollback"
 >     ,"rollup"
 >     --,"row"
->     ,"row_number"
+>     --,"row_number"
 >     ,"rows"
 >     ,"savepoint"
 >     ,"scope"
@@ -359,19 +371,19 @@ Data types to represent different dialect options
 >     ,"sqlexception"
 >     ,"sqlstate"
 >     ,"sqlwarning"
->     ,"sqrt"
+>     --,"sqrt"
 >     --,"start"
 >     ,"static"
 >     --,"stddev_pop"
 >     --,"stddev_samp"
 >     ,"submultiset"
->     ,"substring"
+>     --,"substring"
 >     ,"substring_regex"
 >     ,"succeeds"
 >     --,"sum"
 >     ,"symmetric"
 >     ,"system"
->     ,"system_time"
+>     --,"system_time"
 >     --,"system_user"
 >     ,"table"
 >     ,"tablesample"
@@ -388,8 +400,8 @@ Data types to represent different dialect options
 >     ,"treat"
 >     ,"trigger"
 >     ,"truncate"
->     ,"trim"
->     ,"trim_array"
+>     --,"trim"
+>     --,"trim_array"
 >     --,"true"
 >     ,"uescape"
 >     ,"union"
@@ -412,7 +424,7 @@ Data types to represent different dialect options
 >     ,"when"
 >     ,"whenever"
 >     ,"where"
->     ,"width_bucket"
+>     --,"width_bucket"
 >     ,"window"
 >     ,"with"
 >     ,"within"
diff --git a/Language/SQL/SimpleSQL/Parse.lhs b/Language/SQL/SimpleSQL/Parse.lhs
index 7eccfc7..39d755d 100644
--- a/Language/SQL/SimpleSQL/Parse.lhs
+++ b/Language/SQL/SimpleSQL/Parse.lhs
@@ -733,66 +733,7 @@ all the scalar expressions which start with an identifier
 >                                     then return [Name Nothing x]
 >                                     else fail ""
 >         in unquotedIdentifierTok [] Nothing >>= makeKeywordFunction
->     keywordFunctionNames = [{-"abs"
->                            ,"all"
->                            ,"any"
->                            ,"array_agg"
->                            ,"avg"
->                            ,"ceil"
->                            ,"ceiling"
->                            ,"char_length"
->                            ,"character_length"
->                            ,"coalesce"
->                            ,"collect"
->                            ,"contains"
->                            ,"convert"
->                            ,"corr"
->                            ,"covar_pop"
->                            ,"covar_samp"
->                            ,"count"
->                            ,"cume_dist"
->                            ,"grouping"
->                            ,"intersection"
->                            ,"ln"
->                            ,"max"
->                            ,"mod"
->                            ,"percent_rank"
->                            ,"percentile_cont"
->                            ,"percentile_disc"
->                            ,"power"
->                            ,"rank"
->                            ,"regr_avgx"
->                            ,"regr_avgy"
->                            ,"regr_count"
->                            ,"regr_intercept"
->                            ,"regr_r2"
->                            ,"regr_slope"
->                            ,"regr_sxx"
->                            ,"regr_sxy"
->                            ,"regr_syy"
->                            ,"row"
->                            ,"row_number"
->                            ,-}"set"{-
->                            ,"some"
->                            ,"stddev_pop"
->                            ,"stddev_samp"
->                            ,"sum"
->                            ,"upper"
->                            ,"var_pop"
->                            ,"var_samp"
->                            ,"width_bucket"
->                            -- window functions added here too
->                            ,"row_number"
->                            ,"rank"
->                            ,"dense_rank"
->                            ,"percent_rank"
->                            ,"cume_dist"
->                            ,"ntile"
->                            ,"lead"
->                            ,"lag"
->                            ,"first_value"
->                            ,"last_value"
->                            ,"nth_value"-}
+>     keywordFunctionNames = ["set"
 >                            ]
 
 
@@ -2218,6 +2159,10 @@ special case parsing code to handle this (in the case of set), or it
 is not treated as a keyword (not perfect, but if it more or less
 works, ok for now).
 
+An exception to this is that the standard type names are considered as
+keywords at the moment, with a special case in the type parser to
+make this work. Maybe this isn't necessary or is a bad idea.
+
 It is possible to have a problem if you remove something which is a
 keyword from this list, and still want to parse statements using it
 as a keyword - for instance, removing things like 'from' or 'as',
@@ -2227,8 +2172,9 @@ will likely mean many things don't parse anymore.
 
 -----------
 
-bit hacky, used to make the dialect available during parsing so
-different parsers can be used for different dialects
+Used to make the dialect available during parsing so different parsers
+can be used for different dialects. Not sure if this is the best way
+to do it, but it's convenient.
 
 > type ParseState = Dialect
 
@@ -2241,14 +2187,5 @@ different parsers can be used for different dialects
 >     d <- getState
 >     guard (f d)
 
-TODO: the ParseState and the Dialect argument should be turned into a
-flags struct. Part (or all?) of this struct is the dialect
-information, but each dialect has different versions + a big set of
-flags to control syntax variations within a version of a product
-dialect (for instance, string and identifier parsing rules vary from
-dialect to dialect and version to version, and most or all SQL DBMSs
-appear to have a set of flags to further enable or disable variations
-for quoting and escaping strings and identifiers).
-
-The dialect stuff can also be used for custom options: e.g. to only
+The dialect stuff could also be used for custom options: to only
 parse dml for instance.