From ca910b8e6dd4d0eddcfbdf15c5fad51e80f1d33f Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Sat, 31 Aug 2019 16:15:17 +0100 Subject: [PATCH] tweaks to the keyword list and handling --- Language/SQL/SimpleSQL/Dialect.lhs | 202 +++++++++++++++-------------- Language/SQL/SimpleSQL/Parse.lhs | 81 ++---------- 2 files changed, 116 insertions(+), 167 deletions(-) diff --git a/Language/SQL/SimpleSQL/Dialect.lhs b/Language/SQL/SimpleSQL/Dialect.lhs index 17229cf..710abe7 100644 --- a/Language/SQL/SimpleSQL/Dialect.lhs +++ b/Language/SQL/SimpleSQL/Dialect.lhs @@ -92,6 +92,18 @@ Data types to represent different dialect options > addLimit d = d {diKeywords = "limit": diKeywords d > ,diLimit = True} +todo: review this list +add tests + +think about how to say if something can safely be made a non keyword +(assuming can only be total keyword or not keyword at all) +-> if something can't appear in a scalar expression or next to one, +then I think it's pretty safe + +mostly, things are keywords to avoid them mistakenly being parsed as +aliases or as identifiers/functions/function-like things (aggs, +windows, etc.) 
+ > ansi2011ReservedKeywords :: [String] > ansi2011ReservedKeywords = > [--"abs" -- function @@ -108,78 +120,78 @@ Data types to represent different dialect options > ,"asensitive" -- keyword > ,"asymmetric" -- keyword > ,"at" -- keyword -> ,"atomic" -> ,"authorization" -> --,"avg" -> ,"begin" -> ,"begin_frame" -> ,"begin_partition" -> ,"between" -> ,"bigint" -> ,"binary" -> ,"blob" -> ,"boolean" -> ,"both" -> ,"by" -> ,"call" -> ,"called" -> ,"cardinality" -> ,"cascaded" -> ,"case" -> ,"cast" -> ,"ceil" -> ,"ceiling" -> ,"char" -> --,"char_length" -> ,"character" -> --,"character_length" -> ,"check" -> ,"clob" -> ,"close" -> ,"coalesce" -> ,"collate" -> --,"collect" -> ,"column" -> ,"commit" -> ,"condition" -> ,"connect" -> ,"constraint" -> ,"contains" -> --,"convert" -> --,"corr" -> ,"corresponding" -> --,"count" -> --,"covar_pop" -> --,"covar_samp" -> ,"create" -> ,"cross" -> ,"cube" -> --,"cume_dist" -> ,"current" -> ,"current_catalog" -> --,"current_date" -> --,"current_default_transform_group" -> --,"current_path" -> --,"current_role" -> ,"current_row" -> ,"current_schema" -> ,"current_time" -> --,"current_timestamp" -> ,"current_transform_group_for_type" -> --,"current_user" -> ,"cursor" -> ,"cycle" -> ,"date" -> --,"day" -> ,"deallocate" -> ,"dec" -> ,"decimal" -> ,"declare" -> --,"default" -> ,"delete" -> --,"dense_rank" -> ,"deref" -> ,"describe" +> ,"atomic" -- keyword +> ,"authorization" -- keyword +> --,"avg" -- function +> ,"begin" -- keyword +> --,"begin_frame" -- identifier +> --,"begin_partition" -- identifier +> ,"between" -- keyword +> ,"bigint" -- type +> ,"binary" -- type +> ,"blob" -- type +> ,"boolean" -- type +> ,"both" -- keyword +> ,"by" -- keyword +> ,"call" -- keyword +> ,"called" -- keyword +> -- ,"cardinality" -- function + identifier? +> ,"cascaded" -- keyword +> ,"case" -- keyword +> ,"cast" -- special function +> -- ,"ceil" -- function +> -- ,"ceiling" -- function +> ,"char" -- type (+ keyword?) 
+> --,"char_length" -- function +> ,"character" -- type +> --,"character_length" -- function +> ,"check" -- keyword +> ,"clob" -- type +> ,"close" -- keyword +> -- ,"coalesce" -- function +> ,"collate" -- keyword +> --,"collect" -- function +> ,"column" -- keyword +> ,"commit" -- keyword +> ,"condition" -- keyword +> ,"connect" -- keyword +> ,"constraint" --keyword +> --,"contains" -- keyword? +> --,"convert" -- function? +> --,"corr" -- function +> ,"corresponding" --keyword +> --,"count" --function +> --,"covar_pop" -- function +> --,"covar_samp" --function +> ,"create" -- keyword +> ,"cross" -- keyword +> ,"cube" -- keyword +> --,"cume_dist" -- function +> ,"current" -- keyword +> -- ,"current_catalog" --identifier? +> --,"current_date" -- identifier +> --,"current_default_transform_group" -- identifier +> --,"current_path" -- identifier +> --,"current_role" -- identifier +> -- ,"current_row" -- identifier +> -- ,"current_schema" -- identifier +> -- ,"current_time" -- identifier +> --,"current_timestamp" -- identifier +> --,"current_transform_group_for_type" -- identifier, or keyword? +> --,"current_user" -- identifier +> ,"cursor" -- keyword +> ,"cycle" --keyword +> ,"date" -- type +> ,"day" -- keyword? 
+> ,"deallocate" -- keyword +> ,"dec" -- type +> ,"decimal" -- type +> ,"declare" -- keyword +> --,"default" -- identifier + keyword +> ,"delete" -- keyword +> --,"dense_rank" -- function +> ,"deref" -- keyword +> ,"describe" -- keyword > ,"deterministic" > ,"disconnect" > ,"distinct" @@ -190,9 +202,9 @@ Data types to represent different dialect options > --,"element" > ,"else" > ,"end" -> ,"end_frame" -> ,"end_partition" -> ,"end-exec" +> -- ,"end_frame" -- identifier +> -- ,"end_partition" -- identifier +> ,"end-exec" -- no idea what this is > ,"equals" > ,"escape" > --,"every" @@ -206,12 +218,12 @@ Data types to represent different dialect options > --,"false" > ,"fetch" > ,"filter" -> ,"first_value" +> -- ,"first_value" > ,"float" -> ,"floor" +> --,"floor" > ,"for" > ,"foreign" -> ,"frame_row" +> -- ,"frame_row" -- identifier > ,"free" > ,"from" > ,"full" @@ -225,7 +237,7 @@ Data types to represent different dialect options > ,"groups" > ,"having" > ,"hold" -> --,"hour" +> ,"hour" > ,"identity" > ,"in" > ,"indicator" @@ -241,21 +253,21 @@ Data types to represent different dialect options > ,"into" > ,"is" > ,"join" -> ,"lag" +> --,"lag" > ,"language" > ,"large" -> ,"last_value" +> --,"last_value" > ,"lateral" -> ,"lead" +> --,"lead" > ,"leading" > ,"left" > ,"like" > ,"like_regex" -> ,"ln" +> --,"ln" > ,"local" > ,"localtime" > ,"localtimestamp" -> ,"lower" +> --,"lower" > ,"match" > --,"max" > ,"member" @@ -263,7 +275,7 @@ Data types to represent different dialect options > ,"method" > --,"min" > --,"minute" -> ,"mod" +> --,"mod" > ,"modifies" > --,"module" > --,"month" @@ -277,10 +289,10 @@ Data types to represent different dialect options > ,"none" > ,"normalize" > ,"not" -> ,"nth_value" +> --,"nth_value" > ,"ntile" > --,"null" -> ,"nullif" +> --,"nullif" > ,"numeric" > ,"octet_length" > ,"occurrences_regex" @@ -307,7 +319,7 @@ Data types to represent different dialect options > ,"portion" > ,"position" > ,"position_regex" -> ,"power" +> --,"power" > 
,"precedes" > ,"precision" > ,"prepare" @@ -339,7 +351,7 @@ Data types to represent different dialect options > ,"rollback" > ,"rollup" > --,"row" -> ,"row_number" +> --,"row_number" > ,"rows" > ,"savepoint" > ,"scope" @@ -359,19 +371,19 @@ Data types to represent different dialect options > ,"sqlexception" > ,"sqlstate" > ,"sqlwarning" -> ,"sqrt" +> --,"sqrt" > --,"start" > ,"static" > --,"stddev_pop" > --,"stddev_samp" > ,"submultiset" -> ,"substring" +> --,"substring" > ,"substring_regex" > ,"succeeds" > --,"sum" > ,"symmetric" > ,"system" -> ,"system_time" +> --,"system_time" > --,"system_user" > ,"table" > ,"tablesample" @@ -388,8 +400,8 @@ Data types to represent different dialect options > ,"treat" > ,"trigger" > ,"truncate" -> ,"trim" -> ,"trim_array" +> --,"trim" +> --,"trim_array" > --,"true" > ,"uescape" > ,"union" @@ -412,7 +424,7 @@ Data types to represent different dialect options > ,"when" > ,"whenever" > ,"where" -> ,"width_bucket" +> --,"width_bucket" > ,"window" > ,"with" > ,"within" diff --git a/Language/SQL/SimpleSQL/Parse.lhs b/Language/SQL/SimpleSQL/Parse.lhs index 7eccfc7..39d755d 100644 --- a/Language/SQL/SimpleSQL/Parse.lhs +++ b/Language/SQL/SimpleSQL/Parse.lhs @@ -733,66 +733,7 @@ all the scalar expressions which start with an identifier > then return [Name Nothing x] > else fail "" > in unquotedIdentifierTok [] Nothing >>= makeKeywordFunction -> keywordFunctionNames = [{-"abs" -> ,"all" -> ,"any" -> ,"array_agg" -> ,"avg" -> ,"ceil" -> ,"ceiling" -> ,"char_length" -> ,"character_length" -> ,"coalesce" -> ,"collect" -> ,"contains" -> ,"convert" -> ,"corr" -> ,"covar_pop" -> ,"covar_samp" -> ,"count" -> ,"cume_dist" -> ,"grouping" -> ,"intersection" -> ,"ln" -> ,"max" -> ,"mod" -> ,"percent_rank" -> ,"percentile_cont" -> ,"percentile_disc" -> ,"power" -> ,"rank" -> ,"regr_avgx" -> ,"regr_avgy" -> ,"regr_count" -> ,"regr_intercept" -> ,"regr_r2" -> ,"regr_slope" -> ,"regr_sxx" -> ,"regr_sxy" -> ,"regr_syy" -> ,"row" -> ,"row_number" -> 
,-}"set"{- -> ,"some" -> ,"stddev_pop" -> ,"stddev_samp" -> ,"sum" -> ,"upper" -> ,"var_pop" -> ,"var_samp" -> ,"width_bucket" -> -- window functions added here too -> ,"row_number" -> ,"rank" -> ,"dense_rank" -> ,"percent_rank" -> ,"cume_dist" -> ,"ntile" -> ,"lead" -> ,"lag" -> ,"first_value" -> ,"last_value" -> ,"nth_value"-} +> keywordFunctionNames = ["set" > ] @@ -2218,6 +2159,10 @@ special case parsing code to handle this (in the case of set), or it is not treated as a keyword (not perfect, but if it more or less works, ok for now). +An exception to this is the standard type names are considered as +keywords at the moment, with a special case in the type parser to +make this work. Maybe this isn't necessary or is a bad idea. + It is possible to have a problem if you remove something which is a keyword from this list, and still want to parse statements using it as a keyword - for instance, removing things like 'from' or 'as', @@ -2227,8 +2172,9 @@ will likely mean many things don't parse anymore. ----------- -bit hacky, used to make the dialect available during parsing so -different parsers can be used for different dialects +Used to make the dialect available during parsing so different parsers +can be used for different dialects. Not sure if this is the best way +to do it, but it's convenient > type ParseState = Dialect @@ -2241,14 +2187,5 @@ different parsers can be used for different dialects > d <- getState > guard (f d) -TODO: the ParseState and the Dialect argument should be turned into a -flags struct. Part (or all?) of this struct is the dialect -information, but each dialect has different versions + a big set of -flags to control syntax variations within a version of a product -dialect (for instance, string and identifier parsing rules vary from -dialect to dialect and version to version, and most or all SQL DBMSs -appear to have a set of flags to further enable or disable variations -for quoting and escaping strings and identifiers). 
- -The dialect stuff can also be used for custom options: e.g. to only +The dialect stuff could also be used for custom options: e.g. to only parse dml for instance.