add some big improvements to parse error messages

change the parser to not attempt to parse the elements following 'from' unless there is a actual 'from' improve the symbol parser to try to deal with issues when symbols are next to eachother with no intervening whitespaces improve number literal parsing to fail if there are trailing letters or digits which aren't part of the number and aren't separated with whitespace add some code to start analysing the quality of parse error messages
2014-04-17 18:32:41 +03:00 · 2014-04-17 18:32:41 +03:00 · 488310ff6a
parent c48b057457
commit 488310ff6a
7 changed files with 298 additions and 41 deletions
--- a/Language/SQL/SimpleSQL/Parser.lhs
+++ b/Language/SQL/SimpleSQL/Parser.lhs
@ -17,7 +17,7 @@
 >                    ,option,between,sepBy,sepBy1,string,manyTill,anyChar
 >                    ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
 >                    ,optionMaybe,optional,many,letter,parse
->                    ,chainl1)
+>                    ,chainl1, (<?>),notFollowedBy,alphaNum)
 > import Text.Parsec.String (Parser)
 > import qualified Text.Parsec as P (ParseError)
 > import Text.Parsec.Perm (permute,(<$?>), (<|?>))
@ -580,6 +580,7 @@ fragile and could at least do with some heavy explanation.
 >               ,star
 >               ,iden
 >               ,parensValue]
 >        <?> "value expression"
 expose the b expression for window frame clause range between
@ -745,25 +746,48 @@ and union, etc..
 >         mkSelect
 >         <$> (fromMaybe All <$> duplicates)
 >         <*> selectList
->         <*> option [] from
+>         <*> optionMaybe tableExpression
->         <*> optionMaybe whereClause
+>     mkSelect d sl Nothing =
->         <*> option [] groupByClause
+>         makeSelect{qeSetQuantifier = d, qeSelectList = sl}
->         <*> optionMaybe having
+>     mkSelect d sl (Just (TableExpression f w g h od ofs fe)) =
 >         <*> option [] orderBy
 >         <*> offsetFetch
 >     mkSelect d sl f w g h od (ofs,fe) =
 >         Select d sl f w g h od ofs fe
 >     values = keyword_ "values"
 >              >> Values <$> commaSep (parens (commaSep valueExpr))
 >     table = keyword_ "table" >> Table <$> name
 local data type to help with parsing the bit after the select list,
 called 'table expression' in the ansi sql grammar. Maybe this should
 be in the public syntax?
 > data TableExpression
 >     = TableExpression
 >       {_teFrom :: [TableRef]
 >       ,_teWhere :: Maybe ValueExpr
 >       ,_teGroupBy :: [GroupingExpr]
 >       ,_teHaving :: Maybe ValueExpr
 >       ,_teOrderBy :: [SortSpec]
 >       ,_teOffset :: Maybe ValueExpr
 >       ,_teFetchFirst :: Maybe ValueExpr}
 > tableExpression :: Parser TableExpression
 > tableExpression =
 >    mkTe <$> from
 >         <*> optionMaybe whereClause
 >         <*> option [] groupByClause
 >         <*> optionMaybe having
 >         <*> option [] orderBy
 >         <*> offsetFetch
 >  where
 >     mkTe f w g h od (ofs,fe) =
 >         TableExpression f w g h od ofs fe
 > queryExprSuffix :: QueryExpr -> Parser QueryExpr
 > queryExprSuffix qe =
 >     (CombineQueryExpr qe
->      <$> choice
+>      <$> (choice
 >          [Union <$ keyword_ "union"
 >          ,Intersect <$ keyword_ "intersect"
->          ,Except <$ keyword_ "except"]
+>          ,Except <$ keyword_ "except"] <?> "set operator")
 >      <*> (fromMaybe Distinct <$> duplicates)
 >      <*> option Respectively
 >                 (Corresponding <$ keyword_ "corresponding")
@ -797,7 +821,7 @@ whitespace parser which skips comments also
 >     choice [simpleWhitespace *> whitespace
 >            ,lineComment *> whitespace
 >            ,blockComment *> whitespace
->            ,return ()]
+>            ,return ()] <?> "whitespace"
 >   where
 >     lineComment = try (string "--")
 >                   *> manyTill anyChar (void (char '\n') <|> eof)
@ -813,7 +837,7 @@ whitespace parser which skips comments also
 > lexeme p = p <* whitespace
 > integer :: Parser Integer
-> integer = read <$> lexeme (many1 digit)
+> integer = read <$> lexeme (many1 digit) <?> "integer"
 number literals
@ -830,12 +854,14 @@ making a decision on how to represent numbers, the client code can
 make this choice.
 > numberLiteral :: Parser String
-> numberLiteral = lexeme $
+> numberLiteral = lexeme (
->     choice [int
+>     (choice [int
 >             >>= optionSuffix dot
 >             >>= optionSuffix fracts
 >             >>= optionSuffix expon
->            ,fract "" >>= optionSuffix expon]
+>            ,fract "" >>= optionSuffix expon])
 >     <* notFollowedBy (alphaNum <|> char '.'))
 >     <?> "number literal"
 >   where
 >     int = many1 digit
 >     fract p = dot p >>= fracts
@ -850,25 +876,27 @@ make this choice.
 > identifier :: Parser String
 > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
 >              <?> "identifier"
 >   where
 >     firstChar = letter <|> char '_'
 >     nonFirstChar = digit <|> firstChar
 > quotedIdentifier :: Parser String
 > quotedIdentifier = char '"' *> manyTill anyChar doubleQuote
 >                    <?> "identifier"
 TODO: add "" inside quoted identifiers
 todo: work out the symbol parsing better
 > symbol :: String -> Parser String
-> symbol s = try $ lexeme $ do
+> symbol s = try (lexeme $ do
->     u <- choice
+>     u <- choice (many1 (char '.') :
->          [string "."
+>                  map (try . string) [">=","<=","!=","<>","||"]
->          ,many1 (oneOf "<>=+-^%/*!|~&")
+>                  ++ map (string . (:[])) "+-^*/%~&|<>=")
 >          ]
 >     guard (s == u)
->     return s
+>     return s)
 >     <?> s
 > questionMark :: Parser Char
 > questionMark = lexeme $ char '?'
@ -896,6 +924,7 @@ todo: work out the symbol parsing better
 > stringToken =
 >     lexeme (char '\'' *> manyTill anyChar (char '\'')
 >     >>= optionSuffix moreString)
 >     <?> "string"
 >   where
 >     moreString s0 = try $ do
 >         void $ char '\''
@ -905,10 +934,10 @@ todo: work out the symbol parsing better
 = helper functions
 > keyword :: String -> Parser String
-> keyword k = try $ do
+> keyword k = try (do
 >     i <- identifier
 >     guard (map toLower i == k)
->     return k
+>     return k) <?> k
 > parens :: Parser a -> Parser a
 > parens = between openParen closeParen
@ -934,10 +963,11 @@ instead, and create an alternative suffix parser
 > optionSuffix p a = option a (p a)
 > identifierBlacklist :: [String] -> Parser String
-> identifierBlacklist bl = try $ do
+> identifierBlacklist bl = try (do
 >     i <- identifier
 >     guard (map toLower i `notElem` bl)
->     return i
+>     return i)
 >     <?> "identifier"
 > blacklist :: [String]
 > blacklist =
@ -958,6 +988,10 @@ sure what other places strictly need the blacklist, and in theory it
 could be tuned differently for each place the identifierString/
 identifier parsers are used to only blacklist the bare minimum.
 The standard has a weird mix of reserved keywords and unreserved
 keywords (I'm not sure what exactly being an unreserved keyword
 means).
 --------------------------------------------
 = helper functions
--- a/33
+++ b/33
@ -1,4 +1,5 @@
 continue 2003 review and tests
 docs: how to run the tests
 touch up the expr hack as best as can
 left factor as much as possible (see below on notes)
 table expression in syntax:
@ -14,11 +15,41 @@ look at fixing the expression parsing completely
 represent natural and using/on in the syntax more close to the
   concrete syntax - don't combine in the ast
 review the token parsers, and make sure they have trailing delimiters
   or consume bad trailing characters and fail (e.g. 1e2e3 in a select
   list parses as '1e2 e3' i.e. '1e2 as e3'
   split the general symbol and operator parsing, and make it tighter
   in terms of when the symbol or operator ends (don't allow to end
   early)
 approach: review the lexical syntax, create complete list of
   tokens/token generators. Divide into tokens which must be followed
   by some particular other token or at least one whitespace, and ones
   which can be immediately followed by another token. Then fix the
   lexing parsers to work this way
 whitespace/comments
 integers
 numbers
 string literals
 keywords
 operator symbols <>=+=^%/*!|~&
 non operator symbols ()?,;"'
 identifiers
 quoted identifiers
 identifiers and keywords are ok for now
 there are issues with integers, numbers, operators and non operator
   symbols
 review places in the parse which should allow only a fixed set of
   identifiers (e.g. in interval literals)
 decide whether to represent numeric literals better, instead of a
   single string - break up into parts, or parse to a Decimal or
   something
 rough SQL 2003 todo, including tests to write:
 can multipart identifiers have whitespace around the '.'?
 multipart string literals
 national, unicode, hex, bit string literals, escapes
 string literal character sets
@ -92,7 +123,7 @@ additional stuff review:
 interval stuff
 collate clause
 aggregate functions: lots of missing bits
-
+complete list of keywords/reserved keywords
 review areas where this parser is too permissive, e.g. value
--- a/tools/Language/SQL/SimpleSQL/ErrorMessages.lhs
+++ b/tools/Language/SQL/SimpleSQL/ErrorMessages.lhs
@ -0,0 +1,149 @@
 Want to work on the error messages. Ultimately, parsec won't give the
 best error message for a parser combinator library in haskell. Should
 check out the alternatives such as polyparse and uu-parsing.
 For now the plan is to try to get the best out of parsec. Skip heavy
 work on this until the parser is more left factored?
 Ideas:
 1. generate large lists of invalid syntax
 2. create table of the sql source and the error message
 3. save these tables and compare from version to version. Want to
   catch improvements and regressions and investigate. Have to do this
   manually
 = generating bad sql source
 take good sql statements or expressions. Convert them into sequences
 of tokens - want to preserve the whitespace and comments perfectly
 here. Then modify these lists by either adding a token, removing a
 token, or modifying a token (including creating bad tokens of raw
 strings which don't represent anything than can be tokenized.
 Now can see the error message for all of these bad strings. Probably
 have to generate and prune this list manually in stages since there
 will be too many.
 Contexts:
 another area to focus on is contexts: for instance, we have a set of
 e.g. 1000 bad scalar expressions with error messages. Now can put
 those bad scalar expressions into various contexts and see that the
 error messages are still good.
 plan:
 1. create a list of all the value expression, with some variations for
   each
 2. manually create some error variations for each expression
 3. create a renderer which will create a csv of the expressions and
   the errors
   this is to load as a spreadsheet to investigate more
 4. create a renderer for the csv which will create a markdown file for
   the website. this is to demonstrate the error messages in the
   documentation
 Then create some contexts for all of these: inside another value
 expression, or inside a query expression. Do the same: render and
 review the error messages.
 Then, create some query expressions to focus on the non value
 expression parts.
 > module Language.SQL.SimpleSQL.ErrorMessages where
 > import Language.SQL.SimpleSQL.Parser
 > import Data.List
 > import Text.Groom
 > valueExpressions :: [String]
 > valueExpressions =
 >     ["10.."
 >     ,"..10"
 >     ,"10e1e2"
 >     ,"10e--3"
 >     ,"1a"
 >     ,"1%"
 >     ,"'b'ad'"
 >     ,"'bad"
 >     ,"bad'"
 >     ,"interval '5' ays"
 >     ,"interval '5' days (4.4)"
 >     ,"interval '5' days (a)"
 >     ,"intervala '5' days"
 >     ,"interval 'x' days (3"
 >     ,"interval 'x' days 3)"
 >     ,"1badiden"
 >     ,"$"
 >     ,"!"
 >     ,"*.a"
 >     ,"??"
 >     ,"3?"
 >     ,"?a"
 >     ,"row"
 >     ,"row 1,2"
 >     ,"row(1,2"
 >     ,"row 1,2)"
 >     ,"row(1 2)"
 >     ,"f("
 >     ,"f)"
 >     ,"f(a"
 >     ,"f a)"
 >     ,"f(a b)"
 TODO:
 case
 operators
 >    ,"a + (b + c"
 casts
 subqueries: + whole set of parentheses use
 in list
 'keyword' functions
 aggregates
 window functions
 >    ]
 > queryExpressions :: [String]
 > queryExpressions =
 >     map sl1 valueExpressions
 >     ++ map sl2 valueExpressions
 >     ++ map sl3 valueExpressions
 >     ++
 >     ["select a from t inner jin u"]
 >   where
 >     sl1 x = "select " ++ x ++ " from t"
 >     sl2 x = "select " ++ x ++ ", y from t"
 >     sl3 x = "select " ++ x ++ " fom t"
 > valExprs :: [String] -> [(String,String)]
 > valExprs = map parseOne
 >   where
 >     parseOne x = let p = parseValueExpr "" Nothing x
 >                  in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p)
 > queryExprs :: [String] -> [(String,String)]
 > queryExprs = map parseOne
 >   where
 >     parseOne x = let p = parseQueryExpr "" Nothing x
 >                  in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p)
 > pExprs :: [String] -> [String] -> String
 > pExprs x y =
 >     let l = valExprs x ++ queryExprs y
 >     in intercalate "\n\n\n\n" $ map (\(a,b) -> a ++ "\n" ++ b) l
--- a/tools/Language/SQL/SimpleSQL/Postgres.lhs
+++ b/tools/Language/SQL/SimpleSQL/Postgres.lhs
@ -264,6 +264,8 @@ select page reference
 >     ,"SELECT 2+2;"
->     ,"SELECT distributors.* WHERE distributors.name = 'Westward';"
+>     -- simple-sql-parser doesn't support where without from
 >     -- this can be added for the postgres dialect when it is written
 >     --,"SELECT distributors.* WHERE distributors.name = 'Westward';"
 >     ]
--- a/tools/Language/SQL/SimpleSQL/SQL2003.lhs
+++ b/tools/Language/SQL/SimpleSQL/SQL2003.lhs
@ -8,18 +8,46 @@ We are only interested in the query syntax, goes through sections 5-10
 The goal is to create some coverage tests to get close to supporting a
 large amount of the SQL.
-> module Language.SQL.SimpleSQL.SQL2003 where
+> module Language.SQL.SimpleSQL.SQL2003 (sql2003Tests) where
 > import Language.SQL.SimpleSQL.TestTypes
 > import Language.SQL.SimpleSQL.Syntax
-
+> sql2003Tests :: TestItem
 > sql2003Tests = Group "sql2003Tests"
 >     [stringLiterals
 >     ,nationalCharacterStringLiterals
 >     ,unicodeStringLiterals
 >     ,binaryStringLiterals
 >     ,numericLiterals
 >     ,dateAndTimeLiterals
 >     ,booleanLiterals
 >     ,identifiers
 >     ,typeNames
 >     ,parenthesizedValueExpression
 >     ,targetSpecification
 >     ,contextuallyTypeValueSpec
 >     ,nextValueExpression
 >     ,arrayElementReference
 >     ,multisetElementReference
 >     ,numericValueExpression
 >     ,booleanValueExpression
 >     ,arrayValueConstructor
 >     ,tableValueConstructor
 >     ,fromClause
 >     ,whereClause
 >     ,groupbyClause
 >     ,querySpecification
 >     ,queryExpressions
 >     ,sortSpecificationList
 >     ]
 = 5 Lexical Elements
 Basic definitions of characters used, tokens, symbols, etc. Most of this section would normally be handled within the lexical analyzer rather than in the grammar proper. Further, the original document does not quote the various single characters, which makes it hard to process automatically.
-[There seems to be a lot of unused stuff here, so skip this section and only do bits which 
+[There seems to be a lot of unused stuff here, so skip this section
 and only do bits which are needed by other bits]
 5.1 <SQL terminal character> (p151)
@ -488,8 +516,8 @@ standards to include everything that was dropped also?
 TODO: how to escapes work here?
-> bitBinaryStringLiterals :: TestItem
+> binaryStringLiterals :: TestItem
-> bitBinaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr)
+> binaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr)
 >     [("B'101010'", undefined)
 >     ,("X'7f7f7f'", undefined)
 >     ]
@ -1031,11 +1059,12 @@ TODO: review how the special keywords are parsed and add tests for these
 > targetSpecification :: TestItem
 > targetSpecification = Group "target specification" $ map (uncurry TestValueExpr)
 >     [(":hostparam", undefined)
 >     ,(":hostparam indicator :another_host_param", undefined)
 >     ,("?", undefined)
 >     ,(":h[3]", undefined)
 >     ]
-TODO: modules stuff, indicators, not sure what current_collation is
+TODO: modules stuff, not sure what current_collation is
 for or how it works
@ -1849,7 +1878,7 @@ Specify a set of <row value expression>s to be constructed into a table.
 <contextually typed row value expression list>    ::=   <contextually typed row value expression> [ { <comma> <contextually typed row value expression> }... ]
 > tableValueConstructor :: TestItem
-> tableValueConstructor = Group "table value constructor" $ map (uncurry TestValueExpr)
+> tableValueConstructor = Group "table value constructor" $ map (uncurry TestQueryExpr)
 >     [("values (1,2), (a+b,(select count(*) from t));", undefined)
 >     ]
@ -1869,7 +1898,7 @@ Specify a table or a grouped table.
 TODO: expand on these tests and review uncovered grammar
 > fromClause :: TestItem
-> fromClause = Group "from clause" $ map (uncurry TestValueExpr)
+> fromClause = Group "from clause" $ map (uncurry TestQueryExpr)
 >     [("select * from t,u", undefined)
 >     ,("select * from t as a", undefined)
@ -1990,7 +2019,7 @@ Specify a table derived by the application of a <search condition> to the result
 <where clause>    ::=   WHERE <search condition>
 > whereClause :: TestItem
-> whereClause = Group "where clause" $ map (uncurry TestValueExpr)
+> whereClause = Group "where clause" $ map (uncurry TestQueryExpr)
 >     [("select * from t where a = 5", undefined)]
@ -2042,7 +2071,7 @@ It seems even in sql 2003, you can only put column references in the
 groups, and not general value expressions.
 > groupbyClause :: TestItem
-> groupbyClause = Group "group by clause" $ map (uncurry TestValueExpr)
+> groupbyClause = Group "group by clause" $ map (uncurry TestQueryExpr)
 >     [("select a, sum(b) from t group by a", undefined)
 >     ,("select a, c,sum(b) from t group by a,c", undefined)
 >     ,("select a, c,sum(b) from t group by a,c collate x", undefined)
@ -2146,7 +2175,7 @@ TODO: review this and add more variants
 > querySpecification :: TestItem
-> querySpecification = Group "query specification" $ map (uncurry TestValueExpr)
+> querySpecification = Group "query specification" $ map (uncurry TestQueryExpr)
 >     [("select a from t", undefined)
 >     ,("select all a from t", undefined)
 >     ,("select distinct a from t", undefined)
@ -2206,7 +2235,7 @@ TODO: common table expressions
 <corresponding column list>    ::=   <column name list>
 > queryExpressions :: TestItem
-> queryExpressions = Group "query expressions" $ map (uncurry TestValueExpr)
+> queryExpressions = Group "query expressions" $ map (uncurry TestQueryExpr)
 >     [("select a from t union select a from u", undefined)
 >     ,("select a from t union all select a from u", undefined)
@ -2784,7 +2813,7 @@ Specify a sort order.
 TODO: review sort specifications
 > sortSpecificationList :: TestItem
-> sortSpecificationList = Group "sort specification list" $ map (uncurry TestValueExpr)
+> sortSpecificationList = Group "sort specification list" $ map (uncurry TestQueryExpr)
 >     [("select * from t order by a", undefined)
 >     ,("select * from t order by a,b", undefined)
 >     ,("select * from t order by a asc,b", undefined)
@ -2794,3 +2823,7 @@ TODO: review sort specifications
 >     ]
 TODO: what happened to the collation in order by?
 Answer: sort used to be a column reference with an optional
 collate. Since it is now a value expression, the collate doesn't need
 to be mentioned here.
--- a/tools/Language/SQL/SimpleSQL/Tests.lhs
+++ b/tools/Language/SQL/SimpleSQL/Tests.lhs
@ -28,7 +28,7 @@ test data to the Test.Framework tests.
 > import Language.SQL.SimpleSQL.ValueExprs
 > import Language.SQL.SimpleSQL.Tpch
-
+> import Language.SQL.SimpleSQL.SQL2003
 Order the tests to start from the simplest first. This is also the
 order on the generated documentation.
@ -44,6 +44,7 @@ order on the generated documentation.
 >     ,fullQueriesTests
 >     ,postgresTests
 >     ,tpchTests
 >     ,sql2003Tests
 >     ]
 > tests :: Test.Framework.Test
--- a/tools/ShowErrors.lhs
+++ b/tools/ShowErrors.lhs
@ -0,0 +1,7 @@
 > import Language.SQL.SimpleSQL.ErrorMessages
 > main :: IO ()
 > main = putStrLn $ pExprs valueExpressions queryExpressions