1
Fork 0

add some big improvements to parse error messages

change the parser to not attempt to parse the elements following
'from' unless there is a actual 'from'
improve the symbol parser to try to deal with issues when symbols are
  next to eachother with no intervening whitespaces
improve number literal parsing to fail if there are trailing letters
  or digits which aren't part of the number and aren't separated with
  whitespace
add some code to start analysing the quality of parse error messages
This commit is contained in:
Jake Wheat 2014-04-17 18:32:41 +03:00
parent c48b057457
commit 488310ff6a
7 changed files with 298 additions and 41 deletions

View file

@ -17,7 +17,7 @@
> ,option,between,sepBy,sepBy1,string,manyTill,anyChar > ,option,between,sepBy,sepBy1,string,manyTill,anyChar
> ,try,string,many1,oneOf,digit,(<|>),choice,char,eof > ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
> ,optionMaybe,optional,many,letter,parse > ,optionMaybe,optional,many,letter,parse
> ,chainl1) > ,chainl1, (<?>),notFollowedBy,alphaNum)
> import Text.Parsec.String (Parser) > import Text.Parsec.String (Parser)
> import qualified Text.Parsec as P (ParseError) > import qualified Text.Parsec as P (ParseError)
> import Text.Parsec.Perm (permute,(<$?>), (<|?>)) > import Text.Parsec.Perm (permute,(<$?>), (<|?>))
@ -580,6 +580,7 @@ fragile and could at least do with some heavy explanation.
> ,star > ,star
> ,iden > ,iden
> ,parensValue] > ,parensValue]
> <?> "value expression"
expose the b expression for window frame clause range between expose the b expression for window frame clause range between
@ -745,25 +746,48 @@ and union, etc..
> mkSelect > mkSelect
> <$> (fromMaybe All <$> duplicates) > <$> (fromMaybe All <$> duplicates)
> <*> selectList > <*> selectList
> <*> option [] from > <*> optionMaybe tableExpression
> <*> optionMaybe whereClause > mkSelect d sl Nothing =
> <*> option [] groupByClause > makeSelect{qeSetQuantifier = d, qeSelectList = sl}
> <*> optionMaybe having > mkSelect d sl (Just (TableExpression f w g h od ofs fe)) =
> <*> option [] orderBy
> <*> offsetFetch
> mkSelect d sl f w g h od (ofs,fe) =
> Select d sl f w g h od ofs fe > Select d sl f w g h od ofs fe
> values = keyword_ "values" > values = keyword_ "values"
> >> Values <$> commaSep (parens (commaSep valueExpr)) > >> Values <$> commaSep (parens (commaSep valueExpr))
> table = keyword_ "table" >> Table <$> name > table = keyword_ "table" >> Table <$> name
local data type to help with parsing the bit after the select list,
called 'table expression' in the ansi sql grammar. Maybe this should
be in the public syntax?
> data TableExpression
> = TableExpression
> {_teFrom :: [TableRef]
> ,_teWhere :: Maybe ValueExpr
> ,_teGroupBy :: [GroupingExpr]
> ,_teHaving :: Maybe ValueExpr
> ,_teOrderBy :: [SortSpec]
> ,_teOffset :: Maybe ValueExpr
> ,_teFetchFirst :: Maybe ValueExpr}
> tableExpression :: Parser TableExpression
> tableExpression =
> mkTe <$> from
> <*> optionMaybe whereClause
> <*> option [] groupByClause
> <*> optionMaybe having
> <*> option [] orderBy
> <*> offsetFetch
> where
> mkTe f w g h od (ofs,fe) =
> TableExpression f w g h od ofs fe
> queryExprSuffix :: QueryExpr -> Parser QueryExpr > queryExprSuffix :: QueryExpr -> Parser QueryExpr
> queryExprSuffix qe = > queryExprSuffix qe =
> (CombineQueryExpr qe > (CombineQueryExpr qe
> <$> choice > <$> (choice
> [Union <$ keyword_ "union" > [Union <$ keyword_ "union"
> ,Intersect <$ keyword_ "intersect" > ,Intersect <$ keyword_ "intersect"
> ,Except <$ keyword_ "except"] > ,Except <$ keyword_ "except"] <?> "set operator")
> <*> (fromMaybe Distinct <$> duplicates) > <*> (fromMaybe Distinct <$> duplicates)
> <*> option Respectively > <*> option Respectively
> (Corresponding <$ keyword_ "corresponding") > (Corresponding <$ keyword_ "corresponding")
@ -797,7 +821,7 @@ whitespace parser which skips comments also
> choice [simpleWhitespace *> whitespace > choice [simpleWhitespace *> whitespace
> ,lineComment *> whitespace > ,lineComment *> whitespace
> ,blockComment *> whitespace > ,blockComment *> whitespace
> ,return ()] > ,return ()] <?> "whitespace"
> where > where
> lineComment = try (string "--") > lineComment = try (string "--")
> *> manyTill anyChar (void (char '\n') <|> eof) > *> manyTill anyChar (void (char '\n') <|> eof)
@ -813,7 +837,7 @@ whitespace parser which skips comments also
> lexeme p = p <* whitespace > lexeme p = p <* whitespace
> integer :: Parser Integer > integer :: Parser Integer
> integer = read <$> lexeme (many1 digit) > integer = read <$> lexeme (many1 digit) <?> "integer"
number literals number literals
@ -830,12 +854,14 @@ making a decision on how to represent numbers, the client code can
make this choice. make this choice.
> numberLiteral :: Parser String > numberLiteral :: Parser String
> numberLiteral = lexeme $ > numberLiteral = lexeme (
> choice [int > (choice [int
> >>= optionSuffix dot > >>= optionSuffix dot
> >>= optionSuffix fracts > >>= optionSuffix fracts
> >>= optionSuffix expon > >>= optionSuffix expon
> ,fract "" >>= optionSuffix expon] > ,fract "" >>= optionSuffix expon])
> <* notFollowedBy (alphaNum <|> char '.'))
> <?> "number literal"
> where > where
> int = many1 digit > int = many1 digit
> fract p = dot p >>= fracts > fract p = dot p >>= fracts
@ -850,25 +876,27 @@ make this choice.
> identifier :: Parser String > identifier :: Parser String
> identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
> <?> "identifier"
> where > where
> firstChar = letter <|> char '_' > firstChar = letter <|> char '_'
> nonFirstChar = digit <|> firstChar > nonFirstChar = digit <|> firstChar
> quotedIdentifier :: Parser String > quotedIdentifier :: Parser String
> quotedIdentifier = char '"' *> manyTill anyChar doubleQuote > quotedIdentifier = char '"' *> manyTill anyChar doubleQuote
> <?> "identifier"
TODO: add "" inside quoted identifiers TODO: add "" inside quoted identifiers
todo: work out the symbol parsing better todo: work out the symbol parsing better
> symbol :: String -> Parser String > symbol :: String -> Parser String
> symbol s = try $ lexeme $ do > symbol s = try (lexeme $ do
> u <- choice > u <- choice (many1 (char '.') :
> [string "." > map (try . string) [">=","<=","!=","<>","||"]
> ,many1 (oneOf "<>=+-^%/*!|~&") > ++ map (string . (:[])) "+-^*/%~&|<>=")
> ]
> guard (s == u) > guard (s == u)
> return s > return s)
> <?> s
> questionMark :: Parser Char > questionMark :: Parser Char
> questionMark = lexeme $ char '?' > questionMark = lexeme $ char '?'
@ -896,6 +924,7 @@ todo: work out the symbol parsing better
> stringToken = > stringToken =
> lexeme (char '\'' *> manyTill anyChar (char '\'') > lexeme (char '\'' *> manyTill anyChar (char '\'')
> >>= optionSuffix moreString) > >>= optionSuffix moreString)
> <?> "string"
> where > where
> moreString s0 = try $ do > moreString s0 = try $ do
> void $ char '\'' > void $ char '\''
@ -905,10 +934,10 @@ todo: work out the symbol parsing better
= helper functions = helper functions
> keyword :: String -> Parser String > keyword :: String -> Parser String
> keyword k = try $ do > keyword k = try (do
> i <- identifier > i <- identifier
> guard (map toLower i == k) > guard (map toLower i == k)
> return k > return k) <?> k
> parens :: Parser a -> Parser a > parens :: Parser a -> Parser a
> parens = between openParen closeParen > parens = between openParen closeParen
@ -934,10 +963,11 @@ instead, and create an alternative suffix parser
> optionSuffix p a = option a (p a) > optionSuffix p a = option a (p a)
> identifierBlacklist :: [String] -> Parser String > identifierBlacklist :: [String] -> Parser String
> identifierBlacklist bl = try $ do > identifierBlacklist bl = try (do
> i <- identifier > i <- identifier
> guard (map toLower i `notElem` bl) > guard (map toLower i `notElem` bl)
> return i > return i)
> <?> "identifier"
> blacklist :: [String] > blacklist :: [String]
> blacklist = > blacklist =
@ -958,6 +988,10 @@ sure what other places strictly need the blacklist, and in theory it
could be tuned differently for each place the identifierString/ could be tuned differently for each place the identifierString/
identifier parsers are used to only blacklist the bare minimum. identifier parsers are used to only blacklist the bare minimum.
The standard has a weird mix of reserved keywords and unreserved
keywords (I'm not sure what exactly being an unreserved keyword
means).
-------------------------------------------- --------------------------------------------
= helper functions = helper functions

33
TODO
View file

@ -1,4 +1,5 @@
continue 2003 review and tests continue 2003 review and tests
docs: how to run the tests
touch up the expr hack as best as can touch up the expr hack as best as can
left factor as much as possible (see below on notes) left factor as much as possible (see below on notes)
table expression in syntax: table expression in syntax:
@ -14,11 +15,41 @@ look at fixing the expression parsing completely
represent natural and using/on in the syntax more close to the represent natural and using/on in the syntax more close to the
concrete syntax - don't combine in the ast concrete syntax - don't combine in the ast
review the token parsers, and make sure they have trailing delimiters
or consume bad trailing characters and fail (e.g. 1e2e3 in a select
list parses as '1e2 e3' i.e. '1e2 as e3'
split the general symbol and operator parsing, and make it tighter
in terms of when the symbol or operator ends (don't allow to end
early)
approach: review the lexical syntax, create complete list of
tokens/token generators. Divide into tokens which must be followed
by some particular other token or at least one whitespace, and ones
which can be immediately followed by another token. Then fix the
lexing parsers to work this way
whitespace/comments
integers
numbers
string literals
keywords
operator symbols <>=+=^%/*!|~&
non operator symbols ()?,;"'
identifiers
quoted identifiers
identifiers and keywords are ok for now
there are issues with integers, numbers, operators and non operator
symbols
review places in the parse which should allow only a fixed set of
identifiers (e.g. in interval literals)
decide whether to represent numeric literals better, instead of a decide whether to represent numeric literals better, instead of a
single string - break up into parts, or parse to a Decimal or single string - break up into parts, or parse to a Decimal or
something something
rough SQL 2003 todo, including tests to write: rough SQL 2003 todo, including tests to write:
can multipart identifiers have whitespace around the '.'?
multipart string literals multipart string literals
national, unicode, hex, bit string literals, escapes national, unicode, hex, bit string literals, escapes
string literal character sets string literal character sets
@ -92,7 +123,7 @@ additional stuff review:
interval stuff interval stuff
collate clause collate clause
aggregate functions: lots of missing bits aggregate functions: lots of missing bits
complete list of keywords/reserved keywords
review areas where this parser is too permissive, e.g. value review areas where this parser is too permissive, e.g. value

View file

@ -0,0 +1,149 @@
Want to work on the error messages. Ultimately, parsec won't give the
best error message for a parser combinator library in haskell. Should
check out the alternatives such as polyparse and uu-parsing.
For now the plan is to try to get the best out of parsec. Skip heavy
work on this until the parser is more left factored?
Ideas:
1. generate large lists of invalid syntax
2. create table of the sql source and the error message
3. save these tables and compare from version to version. Want to
catch improvements and regressions and investigate. Have to do this
manually
= generating bad sql source
take good sql statements or expressions. Convert them into sequences
of tokens - want to preserve the whitespace and comments perfectly
here. Then modify these lists by either adding a token, removing a
token, or modifying a token (including creating bad tokens of raw
strings which don't represent anything than can be tokenized.
Now can see the error message for all of these bad strings. Probably
have to generate and prune this list manually in stages since there
will be too many.
Contexts:
another area to focus on is contexts: for instance, we have a set of
e.g. 1000 bad scalar expressions with error messages. Now can put
those bad scalar expressions into various contexts and see that the
error messages are still good.
plan:
1. create a list of all the value expression, with some variations for
each
2. manually create some error variations for each expression
3. create a renderer which will create a csv of the expressions and
the errors
this is to load as a spreadsheet to investigate more
4. create a renderer for the csv which will create a markdown file for
the website. this is to demonstrate the error messages in the
documentation
Then create some contexts for all of these: inside another value
expression, or inside a query expression. Do the same: render and
review the error messages.
Then, create some query expressions to focus on the non value
expression parts.
> module Language.SQL.SimpleSQL.ErrorMessages where
> import Language.SQL.SimpleSQL.Parser
> import Data.List
> import Text.Groom
> valueExpressions :: [String]
> valueExpressions =
> ["10.."
> ,"..10"
> ,"10e1e2"
> ,"10e--3"
> ,"1a"
> ,"1%"
> ,"'b'ad'"
> ,"'bad"
> ,"bad'"
> ,"interval '5' ays"
> ,"interval '5' days (4.4)"
> ,"interval '5' days (a)"
> ,"intervala '5' days"
> ,"interval 'x' days (3"
> ,"interval 'x' days 3)"
> ,"1badiden"
> ,"$"
> ,"!"
> ,"*.a"
> ,"??"
> ,"3?"
> ,"?a"
> ,"row"
> ,"row 1,2"
> ,"row(1,2"
> ,"row 1,2)"
> ,"row(1 2)"
> ,"f("
> ,"f)"
> ,"f(a"
> ,"f a)"
> ,"f(a b)"
TODO:
case
operators
> ,"a + (b + c"
casts
subqueries: + whole set of parentheses use
in list
'keyword' functions
aggregates
window functions
> ]
> queryExpressions :: [String]
> queryExpressions =
> map sl1 valueExpressions
> ++ map sl2 valueExpressions
> ++ map sl3 valueExpressions
> ++
> ["select a from t inner jin u"]
> where
> sl1 x = "select " ++ x ++ " from t"
> sl2 x = "select " ++ x ++ ", y from t"
> sl3 x = "select " ++ x ++ " fom t"
> valExprs :: [String] -> [(String,String)]
> valExprs = map parseOne
> where
> parseOne x = let p = parseValueExpr "" Nothing x
> in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p)
> queryExprs :: [String] -> [(String,String)]
> queryExprs = map parseOne
> where
> parseOne x = let p = parseQueryExpr "" Nothing x
> in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p)
> pExprs :: [String] -> [String] -> String
> pExprs x y =
> let l = valExprs x ++ queryExprs y
> in intercalate "\n\n\n\n" $ map (\(a,b) -> a ++ "\n" ++ b) l

View file

@ -264,6 +264,8 @@ select page reference
> ,"SELECT 2+2;" > ,"SELECT 2+2;"
> ,"SELECT distributors.* WHERE distributors.name = 'Westward';" > -- simple-sql-parser doesn't support where without from
> -- this can be added for the postgres dialect when it is written
> --,"SELECT distributors.* WHERE distributors.name = 'Westward';"
> ] > ]

View file

@ -8,18 +8,46 @@ We are only interested in the query syntax, goes through sections 5-10
The goal is to create some coverage tests to get close to supporting a The goal is to create some coverage tests to get close to supporting a
large amount of the SQL. large amount of the SQL.
> module Language.SQL.SimpleSQL.SQL2003 where > module Language.SQL.SimpleSQL.SQL2003 (sql2003Tests) where
> import Language.SQL.SimpleSQL.TestTypes > import Language.SQL.SimpleSQL.TestTypes
> import Language.SQL.SimpleSQL.Syntax > import Language.SQL.SimpleSQL.Syntax
> sql2003Tests :: TestItem
> sql2003Tests = Group "sql2003Tests"
> [stringLiterals
> ,nationalCharacterStringLiterals
> ,unicodeStringLiterals
> ,binaryStringLiterals
> ,numericLiterals
> ,dateAndTimeLiterals
> ,booleanLiterals
> ,identifiers
> ,typeNames
> ,parenthesizedValueExpression
> ,targetSpecification
> ,contextuallyTypeValueSpec
> ,nextValueExpression
> ,arrayElementReference
> ,multisetElementReference
> ,numericValueExpression
> ,booleanValueExpression
> ,arrayValueConstructor
> ,tableValueConstructor
> ,fromClause
> ,whereClause
> ,groupbyClause
> ,querySpecification
> ,queryExpressions
> ,sortSpecificationList
> ]
= 5 Lexical Elements = 5 Lexical Elements
Basic definitions of characters used, tokens, symbols, etc. Most of this section would normally be handled within the lexical analyzer rather than in the grammar proper. Further, the original document does not quote the various single characters, which makes it hard to process automatically. Basic definitions of characters used, tokens, symbols, etc. Most of this section would normally be handled within the lexical analyzer rather than in the grammar proper. Further, the original document does not quote the various single characters, which makes it hard to process automatically.
[There seems to be a lot of unused stuff here, so skip this section and only do bits which [There seems to be a lot of unused stuff here, so skip this section
and only do bits which are needed by other bits]
5.1 <SQL terminal character> (p151) 5.1 <SQL terminal character> (p151)
@ -488,8 +516,8 @@ standards to include everything that was dropped also?
TODO: how to escapes work here? TODO: how to escapes work here?
> bitBinaryStringLiterals :: TestItem > binaryStringLiterals :: TestItem
> bitBinaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr) > binaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr)
> [("B'101010'", undefined) > [("B'101010'", undefined)
> ,("X'7f7f7f'", undefined) > ,("X'7f7f7f'", undefined)
> ] > ]
@ -1031,11 +1059,12 @@ TODO: review how the special keywords are parsed and add tests for these
> targetSpecification :: TestItem > targetSpecification :: TestItem
> targetSpecification = Group "target specification" $ map (uncurry TestValueExpr) > targetSpecification = Group "target specification" $ map (uncurry TestValueExpr)
> [(":hostparam", undefined) > [(":hostparam", undefined)
> ,(":hostparam indicator :another_host_param", undefined)
> ,("?", undefined) > ,("?", undefined)
> ,(":h[3]", undefined) > ,(":h[3]", undefined)
> ] > ]
TODO: modules stuff, indicators, not sure what current_collation is TODO: modules stuff, not sure what current_collation is
for or how it works for or how it works
@ -1849,7 +1878,7 @@ Specify a set of <row value expression>s to be constructed into a table.
<contextually typed row value expression list> ::= <contextually typed row value expression> [ { <comma> <contextually typed row value expression> }... ] <contextually typed row value expression list> ::= <contextually typed row value expression> [ { <comma> <contextually typed row value expression> }... ]
> tableValueConstructor :: TestItem > tableValueConstructor :: TestItem
> tableValueConstructor = Group "table value constructor" $ map (uncurry TestValueExpr) > tableValueConstructor = Group "table value constructor" $ map (uncurry TestQueryExpr)
> [("values (1,2), (a+b,(select count(*) from t));", undefined) > [("values (1,2), (a+b,(select count(*) from t));", undefined)
> ] > ]
@ -1869,7 +1898,7 @@ Specify a table or a grouped table.
TODO: expand on these tests and review uncovered grammar TODO: expand on these tests and review uncovered grammar
> fromClause :: TestItem > fromClause :: TestItem
> fromClause = Group "from clause" $ map (uncurry TestValueExpr) > fromClause = Group "from clause" $ map (uncurry TestQueryExpr)
> [("select * from t,u", undefined) > [("select * from t,u", undefined)
> ,("select * from t as a", undefined) > ,("select * from t as a", undefined)
@ -1990,7 +2019,7 @@ Specify a table derived by the application of a <search condition> to the result
<where clause> ::= WHERE <search condition> <where clause> ::= WHERE <search condition>
> whereClause :: TestItem > whereClause :: TestItem
> whereClause = Group "where clause" $ map (uncurry TestValueExpr) > whereClause = Group "where clause" $ map (uncurry TestQueryExpr)
> [("select * from t where a = 5", undefined)] > [("select * from t where a = 5", undefined)]
@ -2042,7 +2071,7 @@ It seems even in sql 2003, you can only put column references in the
groups, and not general value expressions. groups, and not general value expressions.
> groupbyClause :: TestItem > groupbyClause :: TestItem
> groupbyClause = Group "group by clause" $ map (uncurry TestValueExpr) > groupbyClause = Group "group by clause" $ map (uncurry TestQueryExpr)
> [("select a, sum(b) from t group by a", undefined) > [("select a, sum(b) from t group by a", undefined)
> ,("select a, c,sum(b) from t group by a,c", undefined) > ,("select a, c,sum(b) from t group by a,c", undefined)
> ,("select a, c,sum(b) from t group by a,c collate x", undefined) > ,("select a, c,sum(b) from t group by a,c collate x", undefined)
@ -2146,7 +2175,7 @@ TODO: review this and add more variants
> querySpecification :: TestItem > querySpecification :: TestItem
> querySpecification = Group "query specification" $ map (uncurry TestValueExpr) > querySpecification = Group "query specification" $ map (uncurry TestQueryExpr)
> [("select a from t", undefined) > [("select a from t", undefined)
> ,("select all a from t", undefined) > ,("select all a from t", undefined)
> ,("select distinct a from t", undefined) > ,("select distinct a from t", undefined)
@ -2206,7 +2235,7 @@ TODO: common table expressions
<corresponding column list> ::= <column name list> <corresponding column list> ::= <column name list>
> queryExpressions :: TestItem > queryExpressions :: TestItem
> queryExpressions = Group "query expressions" $ map (uncurry TestValueExpr) > queryExpressions = Group "query expressions" $ map (uncurry TestQueryExpr)
> [("select a from t union select a from u", undefined) > [("select a from t union select a from u", undefined)
> ,("select a from t union all select a from u", undefined) > ,("select a from t union all select a from u", undefined)
@ -2784,7 +2813,7 @@ Specify a sort order.
TODO: review sort specifications TODO: review sort specifications
> sortSpecificationList :: TestItem > sortSpecificationList :: TestItem
> sortSpecificationList = Group "sort specification list" $ map (uncurry TestValueExpr) > sortSpecificationList = Group "sort specification list" $ map (uncurry TestQueryExpr)
> [("select * from t order by a", undefined) > [("select * from t order by a", undefined)
> ,("select * from t order by a,b", undefined) > ,("select * from t order by a,b", undefined)
> ,("select * from t order by a asc,b", undefined) > ,("select * from t order by a asc,b", undefined)
@ -2794,3 +2823,7 @@ TODO: review sort specifications
> ] > ]
TODO: what happened to the collation in order by? TODO: what happened to the collation in order by?
Answer: sort used to be a column reference with an optional
collate. Since it is now a value expression, the collate doesn't need
to be mentioned here.

View file

@ -28,7 +28,7 @@ test data to the Test.Framework tests.
> import Language.SQL.SimpleSQL.ValueExprs > import Language.SQL.SimpleSQL.ValueExprs
> import Language.SQL.SimpleSQL.Tpch > import Language.SQL.SimpleSQL.Tpch
> import Language.SQL.SimpleSQL.SQL2003
Order the tests to start from the simplest first. This is also the Order the tests to start from the simplest first. This is also the
order on the generated documentation. order on the generated documentation.
@ -44,6 +44,7 @@ order on the generated documentation.
> ,fullQueriesTests > ,fullQueriesTests
> ,postgresTests > ,postgresTests
> ,tpchTests > ,tpchTests
> ,sql2003Tests
> ] > ]
> tests :: Test.Framework.Test > tests :: Test.Framework.Test

7
tools/ShowErrors.lhs Normal file
View file

@ -0,0 +1,7 @@
> import Language.SQL.SimpleSQL.ErrorMessages
> main :: IO ()
> main = putStrLn $ pExprs valueExpressions queryExpressions