1
Fork 0

start refactoring the dialect support, and work on the keyword handling

This commit is contained in:
Jake Wheat 2019-08-31 11:57:28 +01:00
parent 3707a09cb8
commit eb45eb8705
10 changed files with 379 additions and 382 deletions

View file

@ -4,13 +4,14 @@ Data types to represent different dialect options
> {-# LANGUAGE DeriveDataTypeable #-}
> module Language.SQL.SimpleSQL.Dialect
> (SyntaxFlavour(..)
> ,Dialect(..)
> (Dialect(..)
> ,SyntaxFlavour(..)
> ,ansi2011
> ,mysql
> ,postgres
> ,oracle
> ,sqlserver
> ,ansi2011ReservedKeywords
> ) where
> import Data.Data
@ -27,28 +28,365 @@ hack for now, later will expand to flags on a feature by feature basis
> -- | Used to set the dialect used for parsing and pretty printing,
> -- very unfinished at the moment.
> data Dialect = Dialect {diSyntaxFlavour :: SyntaxFlavour
> ,allowOdbc :: Bool}
> data Dialect = Dialect {diKeywords :: [String]
> ,diSyntaxFlavour :: SyntaxFlavour
> ,diFetchFirst :: Bool
> ,diLimit :: Bool
> ,diOdbc :: Bool}
> deriving (Eq,Show,Read,Data,Typeable)
> -- | ansi sql 2011 dialect
> ansi2011 :: Dialect
> ansi2011 = Dialect ANSI2011 False
> ansi2011 = Dialect {diKeywords = ansi2011ReservedKeywords
> ,diSyntaxFlavour = ANSI2011
> ,diFetchFirst = True
> ,diLimit = False
> ,diOdbc = False}
> -- | mysql dialect
> mysql :: Dialect
> mysql = Dialect MySQL False
> mysql = addLimit ansi2011 {diSyntaxFlavour = MySQL}
> -- | postgresql dialect
> postgres :: Dialect
> postgres = Dialect Postgres False
> postgres = addLimit ansi2011 {diSyntaxFlavour = Postgres}
> -- | oracle dialect
> oracle :: Dialect
> oracle = Dialect Oracle False
> oracle = ansi2011 {diSyntaxFlavour = Oracle}
> -- | microsoft sql server dialect
> sqlserver :: Dialect
> sqlserver = Dialect SQLServer False
> sqlserver = ansi2011 {diSyntaxFlavour = SQLServer}
> addLimit :: Dialect -> Dialect
> addLimit d = d {diKeywords = "limit": diKeywords d
> ,diLimit = True}
> ansi2011ReservedKeywords :: [String]
> ansi2011ReservedKeywords =
> [--"abs" -- function
> "all" -- keyword only?
> ,"allocate" -- keyword
> ,"alter" -- keyword
> ,"and" -- keyword
> --,"any" -- keyword? and function
> ,"are" -- keyword
> ,"array" -- keyword, and used in some special places, like array[...], and array(subquery)
> --,"array_agg" -- function
> -- ,"array_max_cardinality" -- function
> ,"as" -- keyword
> ,"asensitive" -- keyword
> ,"asymmetric" -- keyword
> ,"at" -- keyword
> ,"atomic"
> ,"authorization"
> --,"avg"
> ,"begin"
> ,"begin_frame"
> ,"begin_partition"
> ,"between"
> ,"bigint"
> ,"binary"
> ,"blob"
> ,"boolean"
> ,"both"
> ,"by"
> ,"call"
> ,"called"
> ,"cardinality"
> ,"cascaded"
> ,"case"
> ,"cast"
> ,"ceil"
> ,"ceiling"
> ,"char"
> --,"char_length"
> ,"character"
> --,"character_length"
> ,"check"
> ,"clob"
> ,"close"
> ,"coalesce"
> ,"collate"
> --,"collect"
> ,"column"
> ,"commit"
> ,"condition"
> ,"connect"
> ,"constraint"
> ,"contains"
> --,"convert"
> --,"corr"
> ,"corresponding"
> --,"count"
> --,"covar_pop"
> --,"covar_samp"
> ,"create"
> ,"cross"
> ,"cube"
> --,"cume_dist"
> ,"current"
> ,"current_catalog"
> --,"current_date"
> --,"current_default_transform_group"
> --,"current_path"
> --,"current_role"
> ,"current_row"
> ,"current_schema"
> ,"current_time"
> --,"current_timestamp"
> ,"current_transform_group_for_type"
> --,"current_user"
> ,"cursor"
> ,"cycle"
> ,"date"
> --,"day"
> ,"deallocate"
> ,"dec"
> ,"decimal"
> ,"declare"
> --,"default"
> ,"delete"
> --,"dense_rank"
> ,"deref"
> ,"describe"
> ,"deterministic"
> ,"disconnect"
> ,"distinct"
> ,"double"
> ,"drop"
> ,"dynamic"
> ,"each"
> --,"element"
> ,"else"
> ,"end"
> ,"end_frame"
> ,"end_partition"
> ,"end-exec"
> ,"equals"
> ,"escape"
> --,"every"
> ,"except"
> ,"exec"
> ,"execute"
> ,"exists"
> ,"exp"
> ,"external"
> ,"extract"
> --,"false"
> ,"fetch"
> ,"filter"
> ,"first_value"
> ,"float"
> ,"floor"
> ,"for"
> ,"foreign"
> ,"frame_row"
> ,"free"
> ,"from"
> ,"full"
> ,"function"
> --,"fusion"
> ,"get"
> ,"global"
> ,"grant"
> ,"group"
> --,"grouping"
> ,"groups"
> ,"having"
> ,"hold"
> --,"hour"
> ,"identity"
> ,"in"
> ,"indicator"
> ,"inner"
> ,"inout"
> ,"insensitive"
> ,"insert"
> ,"int"
> ,"integer"
> ,"intersect"
> --,"intersection"
> ,"interval"
> ,"into"
> ,"is"
> ,"join"
> ,"lag"
> ,"language"
> ,"large"
> ,"last_value"
> ,"lateral"
> ,"lead"
> ,"leading"
> ,"left"
> ,"like"
> ,"like_regex"
> ,"ln"
> ,"local"
> ,"localtime"
> ,"localtimestamp"
> ,"lower"
> ,"match"
> --,"max"
> ,"member"
> ,"merge"
> ,"method"
> --,"min"
> --,"minute"
> ,"mod"
> ,"modifies"
> --,"module"
> --,"month"
> ,"multiset"
> ,"national"
> ,"natural"
> ,"nchar"
> ,"nclob"
> ,"new"
> ,"no"
> ,"none"
> ,"normalize"
> ,"not"
> ,"nth_value"
> ,"ntile"
> --,"null"
> ,"nullif"
> ,"numeric"
> ,"octet_length"
> ,"occurrences_regex"
> ,"of"
> ,"offset"
> ,"old"
> ,"on"
> ,"only"
> ,"open"
> ,"or"
> ,"order"
> ,"out"
> ,"outer"
> ,"over"
> ,"overlaps"
> ,"overlay"
> ,"parameter"
> ,"partition"
> ,"percent"
> --,"percent_rank"
> --,"percentile_cont"
> --,"percentile_disc"
> ,"period"
> ,"portion"
> ,"position"
> ,"position_regex"
> ,"power"
> ,"precedes"
> ,"precision"
> ,"prepare"
> ,"primary"
> ,"procedure"
> ,"range"
> --,"rank"
> ,"reads"
> ,"real"
> ,"recursive"
> ,"ref"
> ,"references"
> ,"referencing"
> --,"regr_avgx"
> --,"regr_avgy"
> --,"regr_count"
> --,"regr_intercept"
> --,"regr_r2"
> --,"regr_slope"
> --,"regr_sxx"
> --,"regr_sxy"
> --,"regr_syy"
> ,"release"
> ,"result"
> ,"return"
> ,"returns"
> ,"revoke"
> ,"right"
> ,"rollback"
> ,"rollup"
> --,"row"
> ,"row_number"
> ,"rows"
> ,"savepoint"
> ,"scope"
> ,"scroll"
> ,"search"
> --,"second"
> ,"select"
> ,"sensitive"
> --,"session_user"
> ,"set"
> ,"similar"
> ,"smallint"
> --,"some"
> ,"specific"
> ,"specifictype"
> ,"sql"
> ,"sqlexception"
> ,"sqlstate"
> ,"sqlwarning"
> ,"sqrt"
> --,"start"
> ,"static"
> --,"stddev_pop"
> --,"stddev_samp"
> ,"submultiset"
> ,"substring"
> ,"substring_regex"
> ,"succeeds"
> --,"sum"
> ,"symmetric"
> ,"system"
> ,"system_time"
> --,"system_user"
> ,"table"
> ,"tablesample"
> ,"then"
> ,"time"
> ,"timestamp"
> ,"timezone_hour"
> ,"timezone_minute"
> ,"to"
> ,"trailing"
> ,"translate"
> ,"translate_regex"
> ,"translation"
> ,"treat"
> ,"trigger"
> ,"truncate"
> ,"trim"
> ,"trim_array"
> --,"true"
> ,"uescape"
> ,"union"
> ,"unique"
> --,"unknown"
> ,"unnest"
> ,"update"
> ,"upper"
> --,"user"
> ,"using"
> --,"value"
> ,"values"
> ,"value_of"
> --,"var_pop"
> --,"var_samp"
> ,"varbinary"
> ,"varchar"
> ,"varying"
> ,"versioning"
> ,"when"
> ,"whenever"
> ,"where"
> ,"width_bucket"
> ,"window"
> ,"with"
> ,"within"
> ,"without"
> --,"year"
> ]

View file

@ -346,7 +346,7 @@ compared with ansi and other dialects
> then postgresExtraSymbols
> else []
> ,miscSymbol
> ,if allowOdbc d then odbcSymbol else []
> ,if diOdbc d then odbcSymbol else []
> ,if (diSyntaxFlavour d == Postgres)
> then generalizedPostgresqlOperator
> else basicAnsiOps

View file

@ -185,7 +185,7 @@ fixing them in the syntax but leaving them till the semantic checking
> import Control.Monad.Identity (Identity)
> import Control.Monad (guard, void)
> import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
> import Control.Applicative ((<**>))
> import Data.Char (toLower, isDigit)
> import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
> ,option,between,sepBy,sepBy1
@ -202,7 +202,7 @@ fixing them in the syntax but leaving them till the semantic checking
> import Language.SQL.SimpleSQL.Syntax
> import Language.SQL.SimpleSQL.Combinators
> import Language.SQL.SimpleSQL.Errors
> import Language.SQL.SimpleSQL.Dialect
> --import Language.SQL.SimpleSQL.Dialect
> import qualified Language.SQL.SimpleSQL.Lex as L
> import Data.Maybe
> import Text.Parsec.String (GenParser)
@ -719,26 +719,21 @@ all the scalar expressions which start with an identifier
> idenExpr =
> -- todo: work out how to left factor this
> try (TypedLit <$> typeName <*> singleQuotesOnlyStringTok)
> -- <|> multisetSetFunction
> <|> (try keywordFunction <**> app)
> <|> (names <**> option Iden app)
> where
> -- this is a special case because 'set' is a reserved keyword
> -- and the names parser won't parse it
> -- can't remove it from the reserved keyword list, because
> -- it is used in a lot of places which are ambiguous as a keyword
> -- this approach might be needed with other keywords which look
> -- like identifiers or functions
> keywordFunction =
> let makeKeywordFunction x = if map toLower x `elem` keywordFunctionNames
> then return [Name Nothing x]
> else fail ""
> in unquotedIdentifierTok [] Nothing >>= makeKeywordFunction
> -- todo: this list should be in the dialects
> -- we should have tests to check these work
> -- we should have tests to check if they are used elsewhere, you
> -- get a keyword failure
> -- these are the names of functions which are also keywords
> -- so this identifier can only be used unquoted for a function application
> -- and nowhere else
> -- not sure if this list is 100% correct
> -- todo: make a corresponding list of reserved keywords which can be
> -- parsed as an identifier
> keywordFunctionNames = ["abs"
> keywordFunctionNames = [{-"abs"
> ,"all"
> ,"any"
> ,"array_agg"
@ -777,7 +772,7 @@ all the scalar expressions which start with an identifier
> ,"regr_syy"
> ,"row"
> ,"row_number"
> ,"set"
> ,-}"set"{-
> ,"some"
> ,"stddev_pop"
> ,"stddev_samp"
@ -797,7 +792,7 @@ all the scalar expressions which start with an identifier
> ,"lag"
> ,"first_value"
> ,"last_value"
> ,"nth_value"
> ,"nth_value"-}
> ]
@ -2197,7 +2192,7 @@ helper function to improve error messages
> commaSep1 = (`sepBy1` comma)
> blacklist :: Dialect -> [String]
> blacklist = reservedWord
> blacklist d = diKeywords d
These blacklisted names are mostly needed when we parse something with
an optional alias, e.g. select a a from t. If we write select a from
@ -2207,347 +2202,21 @@ could be tuned differently for each place the identifierString/
identifier parsers are used to only blacklist the bare
minimum. Something like this might be needed for dialect support, even
if it is pretty silly to use a keyword as an unquoted identifier when
there is a effing quoting syntax as well.
there is a quoting syntax as well.
The standard has a weird mix of reserved keywords and unreserved
keywords (I'm not sure what exactly being an unreserved keyword
means).
can't work out if aggregate functions are supposed to be reserved or
not, leave them unreserved for now
> reservedWord :: Dialect -> [String]
> reservedWord d | diSyntaxFlavour d == ANSI2011 =
> ["abs"
> --,"all"
> ,"allocate"
> ,"alter"
> ,"and"
> --,"any"
> ,"are"
> ,"array"
> --,"array_agg"
> ,"array_max_cardinality"
> ,"as"
> ,"asensitive"
> ,"asymmetric"
> ,"at"
> ,"atomic"
> ,"authorization"
> --,"avg"
> ,"begin"
> ,"begin_frame"
> ,"begin_partition"
> ,"between"
> ,"bigint"
> ,"binary"
> ,"blob"
> ,"boolean"
> ,"both"
> ,"by"
> ,"call"
> ,"called"
> ,"cardinality"
> ,"cascaded"
> ,"case"
> ,"cast"
> ,"ceil"
> ,"ceiling"
> ,"char"
> ,"char_length"
> ,"character"
> ,"character_length"
> ,"check"
> ,"clob"
> ,"close"
> ,"coalesce"
> ,"collate"
> --,"collect"
> ,"column"
> ,"commit"
> ,"condition"
> ,"connect"
> ,"constraint"
> ,"contains"
> ,"convert"
> --,"corr"
> ,"corresponding"
> --,"count"
> --,"covar_pop"
> --,"covar_samp"
> ,"create"
> ,"cross"
> ,"cube"
> --,"cume_dist"
> ,"current"
> ,"current_catalog"
> --,"current_date"
> --,"current_default_transform_group"
> --,"current_path"
> --,"current_role"
> ,"current_row"
> ,"current_schema"
> ,"current_time"
> --,"current_timestamp"
> ,"current_transform_group_for_type"
> --,"current_user"
> ,"cursor"
> ,"cycle"
> ,"date"
> --,"day"
> ,"deallocate"
> ,"dec"
> ,"decimal"
> ,"declare"
> --,"default"
> ,"delete"
> --,"dense_rank"
> ,"deref"
> ,"describe"
> ,"deterministic"
> ,"disconnect"
> ,"distinct"
> ,"double"
> ,"drop"
> ,"dynamic"
> ,"each"
> --,"element"
> ,"else"
> ,"end"
> ,"end_frame"
> ,"end_partition"
> ,"end-exec"
> ,"equals"
> ,"escape"
> --,"every"
> ,"except"
> ,"exec"
> ,"execute"
> ,"exists"
> ,"exp"
> ,"external"
> ,"extract"
> --,"false"
> ,"fetch"
> ,"filter"
> ,"first_value"
> ,"float"
> ,"floor"
> ,"for"
> ,"foreign"
> ,"frame_row"
> ,"free"
> ,"from"
> ,"full"
> ,"function"
> --,"fusion"
> ,"get"
> ,"global"
> ,"grant"
> ,"group"
> --,"grouping"
> ,"groups"
> ,"having"
> ,"hold"
> --,"hour"
> ,"identity"
> ,"in"
> ,"indicator"
> ,"inner"
> ,"inout"
> ,"insensitive"
> ,"insert"
> ,"int"
> ,"integer"
> ,"intersect"
> --,"intersection"
> ,"interval"
> ,"into"
> ,"is"
> ,"join"
> ,"lag"
> ,"language"
> ,"large"
> ,"last_value"
> ,"lateral"
> ,"lead"
> ,"leading"
> ,"left"
> ,"like"
> ,"like_regex"
> ,"ln"
> ,"local"
> ,"localtime"
> ,"localtimestamp"
> ,"lower"
> ,"match"
> --,"max"
> ,"member"
> ,"merge"
> ,"method"
> --,"min"
> --,"minute"
> ,"mod"
> ,"modifies"
> --,"module"
> --,"month"
> ,"multiset"
> ,"national"
> ,"natural"
> ,"nchar"
> ,"nclob"
> ,"new"
> ,"no"
> ,"none"
> ,"normalize"
> ,"not"
> ,"nth_value"
> ,"ntile"
> --,"null"
> ,"nullif"
> ,"numeric"
> ,"octet_length"
> ,"occurrences_regex"
> ,"of"
> ,"offset"
> ,"old"
> ,"on"
> ,"only"
> ,"open"
> ,"or"
> ,"order"
> ,"out"
> ,"outer"
> ,"over"
> ,"overlaps"
> ,"overlay"
> ,"parameter"
> ,"partition"
> ,"percent"
> --,"percent_rank"
> --,"percentile_cont"
> --,"percentile_disc"
> ,"period"
> ,"portion"
> ,"position"
> ,"position_regex"
> ,"power"
> ,"precedes"
> ,"precision"
> ,"prepare"
> ,"primary"
> ,"procedure"
> ,"range"
> --,"rank"
> ,"reads"
> ,"real"
> ,"recursive"
> ,"ref"
> ,"references"
> ,"referencing"
> --,"regr_avgx"
> --,"regr_avgy"
> --,"regr_count"
> --,"regr_intercept"
> --,"regr_r2"
> --,"regr_slope"
> --,"regr_sxx"
> --,"regr_sxy"
> --,"regr_syy"
> ,"release"
> ,"result"
> ,"return"
> ,"returns"
> ,"revoke"
> ,"right"
> ,"rollback"
> ,"rollup"
> ,"row"
> ,"row_number"
> ,"rows"
> ,"savepoint"
> ,"scope"
> ,"scroll"
> ,"search"
> --,"second"
> ,"select"
> ,"sensitive"
> --,"session_user"
> ,"set"
> ,"similar"
> ,"smallint"
> --,"some"
> ,"specific"
> ,"specifictype"
> ,"sql"
> ,"sqlexception"
> ,"sqlstate"
> ,"sqlwarning"
> ,"sqrt"
> --,"start"
> ,"static"
> --,"stddev_pop"
> --,"stddev_samp"
> ,"submultiset"
> ,"substring"
> ,"substring_regex"
> ,"succeeds"
> --,"sum"
> ,"symmetric"
> ,"system"
> ,"system_time"
> --,"system_user"
> ,"table"
> ,"tablesample"
> ,"then"
> ,"time"
> ,"timestamp"
> ,"timezone_hour"
> ,"timezone_minute"
> ,"to"
> ,"trailing"
> ,"translate"
> ,"translate_regex"
> ,"translation"
> ,"treat"
> ,"trigger"
> ,"truncate"
> ,"trim"
> ,"trim_array"
> --,"true"
> ,"uescape"
> ,"union"
> ,"unique"
> --,"unknown"
> ,"unnest"
> ,"update"
> ,"upper"
> --,"user"
> ,"using"
> --,"value"
> ,"values"
> ,"value_of"
> --,"var_pop"
> --,"var_samp"
> ,"varbinary"
> ,"varchar"
> ,"varying"
> ,"versioning"
> ,"when"
> ,"whenever"
> ,"where"
> ,"width_bucket"
> ,"window"
> ,"with"
> ,"within"
> ,"without"
> --,"year"
> ]
TODO: create this list properly
move this list into the dialect data type
> reservedWord _ = reservedWord ansi2011 ++ ["limit"]
The current approach tries to have everything which is a keyword only
in the keyword list - so it can only be used in some other context if
quoted. If something is a 'ansi keyword', but appears only as an
identifier or function name for instance in the syntax (or something
that looks identical to this), then it isn't treated as a keyword at
all. When there is some overlap (e.g. 'set'), then there is either
special case parsing code to handle this (in the case of set), or it
is not treated as a keyword (not perfect, but if it more or less
works, ok for now)
-----------

View file

@ -20,7 +20,7 @@ TODO: there should be more comments in this file, especially the bits
which have been changed to try to improve the layout of the output.
> import Language.SQL.SimpleSQL.Syntax
> import Language.SQL.SimpleSQL.Dialect
> --import Language.SQL.SimpleSQL.Dialect
> import Text.PrettyPrint (render, vcat, text, (<>), (<+>), empty, parens,
> nest, Doc, punctuate, comma, sep, quotes,
> brackets,hcat)

View file

@ -58,12 +58,7 @@
> ,AdminOptionFor(..)
> ,GrantOptionFor(..)
> -- * Dialects
> ,Dialect(allowOdbc)
> ,ansi2011
> ,mysql
> ,postgres
> ,oracle
> ,sqlserver
> ,module Language.SQL.SimpleSQL.Dialect
> -- * Comment
> ,Comment(..)
> ) where

View file

@ -10,7 +10,7 @@ build :
.PHONY : test
test :
cabal v2-test
cabal new-run test:Tests -- --hide-successes --ansi-tricks=false
.PHONY : website
website :

7
TODO
View file

@ -33,12 +33,7 @@ themselves
review main missing sql bits - focus on more mainstream things
could also review main dialects
** review the dialect support implementation
-> how to create your own dialects
especially how to override the reserved keyword list easily
make a list of the current dialect specific things
review the reserved word handling and make some more tests
add negative parsing tests for things that should fail
review the dialect support implementation
syntax from hssqlppp:

View file

@ -315,7 +315,7 @@ the + or -.
> odbcLexerTests :: TestItem
> odbcLexerTests = Group "odbcLexTests" $
> [ LexTest sqlserver {allowOdbc = True} s t | (s,t) <-
> [ LexTest sqlserver {diOdbc = True} s t | (s,t) <-
> [("{}", [Symbol "{", Symbol "}"])
> ]]
> ++ [LexFails sqlserver "{"

View file

@ -29,14 +29,14 @@
> ,iden "SQL_DATE"])
> ]
> ,Group "outer join" [
> TestQueryExpr ansi2011 {allowOdbc=True}
> TestQueryExpr ansi2011 {diOdbc=True}
> "select * from {oj t1 left outer join t2 on expr}"
> $ makeSelect
> {qeSelectList = [(Star,Nothing)]
> ,qeFrom = [TROdbc $ TRJoin (TRSimple [Name Nothing "t1"]) False JLeft (TRSimple [Name Nothing "t2"])
> (Just $ JoinOn $ Iden [Name Nothing "expr"])]}]
> ,Group "check parsing bugs" [
> TestQueryExpr ansi2011 {allowOdbc=True}
> TestQueryExpr ansi2011 {diOdbc=True}
> "select {fn CONVERT(cint,SQL_BIGINT)} from t;"
> $ makeSelect
> {qeSelectList = [(OdbcFunc (ap "CONVERT"
@ -45,7 +45,7 @@
> ,qeFrom = [TRSimple [Name Nothing "t"]]}]
> ]
> where
> e = TestScalarExpr ansi2011 {allowOdbc = True}
> e = TestScalarExpr ansi2011 {diOdbc = True}
> --tsql = ParseProcSql defaultParseFlags {pfDialect=sqlServerDialect}
> ap n = App [Name Nothing n]
> iden n = Iden [Name Nothing n]

View file

@ -5,7 +5,7 @@ Tests.lhs module for the 'interpreter'.
> module Language.SQL.SimpleSQL.TestTypes
> (TestItem(..)
> ,ansi2011,mysql,postgres,oracle,sqlserver
> ,allowOdbc) where
> ,diOdbc) where
> import Language.SQL.SimpleSQL.Syntax
> import Language.SQL.SimpleSQL.Lex (Token)