From 488310ff6a7e87406fde580a12f6d4d3407f7ae8 Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Thu, 17 Apr 2014 18:32:41 +0300 Subject: [PATCH] add some big improvements to parse error messages change the parser to not attempt to parse the elements following 'from' unless there is a actual 'from' improve the symbol parser to try to deal with issues when symbols are next to eachother with no intervening whitespaces improve number literal parsing to fail if there are trailing letters or digits which aren't part of the number and aren't separated with whitespace add some code to start analysing the quality of parse error messages --- Language/SQL/SimpleSQL/Parser.lhs | 84 +++++++--- TODO | 33 +++- .../Language/SQL/SimpleSQL/ErrorMessages.lhs | 149 ++++++++++++++++++ tools/Language/SQL/SimpleSQL/Postgres.lhs | 4 +- tools/Language/SQL/SimpleSQL/SQL2003.lhs | 59 +++++-- tools/Language/SQL/SimpleSQL/Tests.lhs | 3 +- tools/ShowErrors.lhs | 7 + 7 files changed, 298 insertions(+), 41 deletions(-) create mode 100644 tools/Language/SQL/SimpleSQL/ErrorMessages.lhs create mode 100644 tools/ShowErrors.lhs diff --git a/Language/SQL/SimpleSQL/Parser.lhs b/Language/SQL/SimpleSQL/Parser.lhs index 1392308..d9af483 100644 --- a/Language/SQL/SimpleSQL/Parser.lhs +++ b/Language/SQL/SimpleSQL/Parser.lhs @@ -17,7 +17,7 @@ > ,option,between,sepBy,sepBy1,string,manyTill,anyChar > ,try,string,many1,oneOf,digit,(<|>),choice,char,eof > ,optionMaybe,optional,many,letter,parse -> ,chainl1) +> ,chainl1, (),notFollowedBy,alphaNum) > import Text.Parsec.String (Parser) > import qualified Text.Parsec as P (ParseError) > import Text.Parsec.Perm (permute,(<$?>), (<|?>)) @@ -580,6 +580,7 @@ fragile and could at least do with some heavy explanation. > ,star > ,iden > ,parensValue] +> "value expression" expose the b expression for window frame clause range between @@ -745,25 +746,48 @@ and union, etc.. > mkSelect > <$> (fromMaybe All <$> duplicates) > <*> selectList -> <*> option [] from -> <*> optionMaybe whereClause -> <*> option [] groupByClause -> <*> optionMaybe having -> <*> option [] orderBy -> <*> offsetFetch -> mkSelect d sl f w g h od (ofs,fe) = +> <*> optionMaybe tableExpression +> mkSelect d sl Nothing = +> makeSelect{qeSetQuantifier = d, qeSelectList = sl} +> mkSelect d sl (Just (TableExpression f w g h od ofs fe)) = > Select d sl f w g h od ofs fe > values = keyword_ "values" > >> Values <$> commaSep (parens (commaSep valueExpr)) > table = keyword_ "table" >> Table <$> name +local data type to help with parsing the bit after the select list, +called 'table expression' in the ansi sql grammar. Maybe this should +be in the public syntax? + +> data TableExpression +> = TableExpression +> {_teFrom :: [TableRef] +> ,_teWhere :: Maybe ValueExpr +> ,_teGroupBy :: [GroupingExpr] +> ,_teHaving :: Maybe ValueExpr +> ,_teOrderBy :: [SortSpec] +> ,_teOffset :: Maybe ValueExpr +> ,_teFetchFirst :: Maybe ValueExpr} + +> tableExpression :: Parser TableExpression +> tableExpression = +> mkTe <$> from +> <*> optionMaybe whereClause +> <*> option [] groupByClause +> <*> optionMaybe having +> <*> option [] orderBy +> <*> offsetFetch +> where +> mkTe f w g h od (ofs,fe) = +> TableExpression f w g h od ofs fe + > queryExprSuffix :: QueryExpr -> Parser QueryExpr > queryExprSuffix qe = > (CombineQueryExpr qe -> <$> choice +> <$> (choice > [Union <$ keyword_ "union" > ,Intersect <$ keyword_ "intersect" -> ,Except <$ keyword_ "except"] +> ,Except <$ keyword_ "except"] "set operator") > <*> (fromMaybe Distinct <$> duplicates) > <*> option Respectively > (Corresponding <$ keyword_ "corresponding") @@ -797,7 +821,7 @@ whitespace parser which skips comments also > choice [simpleWhitespace *> whitespace > ,lineComment *> whitespace > ,blockComment *> whitespace -> ,return ()] +> ,return ()] "whitespace" > where > lineComment = try (string "--") > *> manyTill anyChar (void (char '\n') <|> eof) @@ -813,7 +837,7 @@ whitespace parser which skips comments also > lexeme p = p <* whitespace > integer :: Parser Integer -> integer = read <$> lexeme (many1 digit) +> integer = read <$> lexeme (many1 digit) "integer" number literals @@ -830,12 +854,14 @@ making a decision on how to represent numbers, the client code can make this choice. > numberLiteral :: Parser String -> numberLiteral = lexeme $ -> choice [int +> numberLiteral = lexeme ( +> (choice [int > >>= optionSuffix dot > >>= optionSuffix fracts > >>= optionSuffix expon -> ,fract "" >>= optionSuffix expon] +> ,fract "" >>= optionSuffix expon]) +> <* notFollowedBy (alphaNum <|> char '.')) +> "number literal" > where > int = many1 digit > fract p = dot p >>= fracts @@ -850,25 +876,27 @@ make this choice. > identifier :: Parser String > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) +> "identifier" > where > firstChar = letter <|> char '_' > nonFirstChar = digit <|> firstChar > quotedIdentifier :: Parser String > quotedIdentifier = char '"' *> manyTill anyChar doubleQuote +> "identifier" TODO: add "" inside quoted identifiers todo: work out the symbol parsing better > symbol :: String -> Parser String -> symbol s = try $ lexeme $ do -> u <- choice -> [string "." -> ,many1 (oneOf "<>=+-^%/*!|~&") -> ] +> symbol s = try (lexeme $ do +> u <- choice (many1 (char '.') : +> map (try . string) [">=","<=","!=","<>","||"] +> ++ map (string . (:[])) "+-^*/%~&|<>=") > guard (s == u) -> return s +> return s) +> s > questionMark :: Parser Char > questionMark = lexeme $ char '?' @@ -896,6 +924,7 @@ todo: work out the symbol parsing better > stringToken = > lexeme (char '\'' *> manyTill anyChar (char '\'') > >>= optionSuffix moreString) +> "string" > where > moreString s0 = try $ do > void $ char '\'' @@ -905,10 +934,10 @@ todo: work out the symbol parsing better = helper functions > keyword :: String -> Parser String -> keyword k = try $ do +> keyword k = try (do > i <- identifier > guard (map toLower i == k) -> return k +> return k) k > parens :: Parser a -> Parser a > parens = between openParen closeParen @@ -934,10 +963,11 @@ instead, and create an alternative suffix parser > optionSuffix p a = option a (p a) > identifierBlacklist :: [String] -> Parser String -> identifierBlacklist bl = try $ do +> identifierBlacklist bl = try (do > i <- identifier > guard (map toLower i `notElem` bl) -> return i +> return i) +> "identifier" > blacklist :: [String] > blacklist = @@ -958,6 +988,10 @@ sure what other places strictly need the blacklist, and in theory it could be tuned differently for each place the identifierString/ identifier parsers are used to only blacklist the bare minimum. +The standard has a weird mix of reserved keywords and unreserved +keywords (I'm not sure what exactly being an unreserved keyword +means). + -------------------------------------------- = helper functions diff --git a/TODO b/TODO index 7ed30f0..d158357 100644 --- a/TODO +++ b/TODO @@ -1,4 +1,5 @@ continue 2003 review and tests +docs: how to run the tests touch up the expr hack as best as can left factor as much as possible (see below on notes) table expression in syntax: @@ -14,11 +15,41 @@ look at fixing the expression parsing completely represent natural and using/on in the syntax more close to the concrete syntax - don't combine in the ast +review the token parsers, and make sure they have trailing delimiters + or consume bad trailing characters and fail (e.g. 1e2e3 in a select + list parses as '1e2 e3' i.e. '1e2 as e3' + split the general symbol and operator parsing, and make it tighter + in terms of when the symbol or operator ends (don't allow to end + early) +approach: review the lexical syntax, create complete list of + tokens/token generators. Divide into tokens which must be followed + by some particular other token or at least one whitespace, and ones + which can be immediately followed by another token. Then fix the + lexing parsers to work this way +whitespace/comments +integers +numbers +string literals +keywords +operator symbols <>=+=^%/*!|~& +non operator symbols ()?,;"' +identifiers +quoted identifiers + +identifiers and keywords are ok for now +there are issues with integers, numbers, operators and non operator + symbols + + +review places in the parse which should allow only a fixed set of + identifiers (e.g. in interval literals) + decide whether to represent numeric literals better, instead of a single string - break up into parts, or parse to a Decimal or something rough SQL 2003 todo, including tests to write: +can multipart identifiers have whitespace around the '.'? multipart string literals national, unicode, hex, bit string literals, escapes string literal character sets @@ -92,7 +123,7 @@ additional stuff review: interval stuff collate clause aggregate functions: lots of missing bits - +complete list of keywords/reserved keywords review areas where this parser is too permissive, e.g. value diff --git a/tools/Language/SQL/SimpleSQL/ErrorMessages.lhs b/tools/Language/SQL/SimpleSQL/ErrorMessages.lhs new file mode 100644 index 0000000..1f6c71c --- /dev/null +++ b/tools/Language/SQL/SimpleSQL/ErrorMessages.lhs @@ -0,0 +1,149 @@ + +Want to work on the error messages. Ultimately, parsec won't give the +best error message for a parser combinator library in haskell. Should +check out the alternatives such as polyparse and uu-parsing. + +For now the plan is to try to get the best out of parsec. Skip heavy +work on this until the parser is more left factored? + +Ideas: + +1. generate large lists of invalid syntax +2. create table of the sql source and the error message +3. save these tables and compare from version to version. Want to + catch improvements and regressions and investigate. Have to do this + manually + += generating bad sql source + +take good sql statements or expressions. Convert them into sequences +of tokens - want to preserve the whitespace and comments perfectly +here. Then modify these lists by either adding a token, removing a +token, or modifying a token (including creating bad tokens of raw +strings which don't represent anything than can be tokenized. + +Now can see the error message for all of these bad strings. Probably +have to generate and prune this list manually in stages since there +will be too many. + +Contexts: + +another area to focus on is contexts: for instance, we have a set of +e.g. 1000 bad scalar expressions with error messages. Now can put +those bad scalar expressions into various contexts and see that the +error messages are still good. + +plan: + +1. create a list of all the value expression, with some variations for + each +2. manually create some error variations for each expression +3. create a renderer which will create a csv of the expressions and + the errors + this is to load as a spreadsheet to investigate more +4. create a renderer for the csv which will create a markdown file for + the website. this is to demonstrate the error messages in the + documentation + +Then create some contexts for all of these: inside another value +expression, or inside a query expression. Do the same: render and +review the error messages. + +Then, create some query expressions to focus on the non value +expression parts. + + +> module Language.SQL.SimpleSQL.ErrorMessages where + +> import Language.SQL.SimpleSQL.Parser +> import Data.List +> import Text.Groom + +> valueExpressions :: [String] +> valueExpressions = +> ["10.." +> ,"..10" +> ,"10e1e2" +> ,"10e--3" +> ,"1a" +> ,"1%" + +> ,"'b'ad'" +> ,"'bad" +> ,"bad'" + +> ,"interval '5' ays" +> ,"interval '5' days (4.4)" +> ,"interval '5' days (a)" +> ,"intervala '5' days" +> ,"interval 'x' days (3" +> ,"interval 'x' days 3)" + +> ,"1badiden" +> ,"$" +> ,"!" +> ,"*.a" + +> ,"??" +> ,"3?" +> ,"?a" + +> ,"row" +> ,"row 1,2" +> ,"row(1,2" +> ,"row 1,2)" +> ,"row(1 2)" + +> ,"f(" +> ,"f)" + +> ,"f(a" +> ,"f a)" +> ,"f(a b)" + +TODO: +case +operators + +> ,"a + (b + c" + +casts +subqueries: + whole set of parentheses use +in list +'keyword' functions +aggregates +window functions + + +> ] + +> queryExpressions :: [String] +> queryExpressions = +> map sl1 valueExpressions +> ++ map sl2 valueExpressions +> ++ map sl3 valueExpressions +> ++ +> ["select a from t inner jin u"] +> where +> sl1 x = "select " ++ x ++ " from t" +> sl2 x = "select " ++ x ++ ", y from t" +> sl3 x = "select " ++ x ++ " fom t" + +> valExprs :: [String] -> [(String,String)] +> valExprs = map parseOne +> where +> parseOne x = let p = parseValueExpr "" Nothing x +> in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p) + + +> queryExprs :: [String] -> [(String,String)] +> queryExprs = map parseOne +> where +> parseOne x = let p = parseQueryExpr "" Nothing x +> in (x,either peFormattedError (\x -> "ERROR: parsed ok " ++ groom x) p) + + +> pExprs :: [String] -> [String] -> String +> pExprs x y = +> let l = valExprs x ++ queryExprs y +> in intercalate "\n\n\n\n" $ map (\(a,b) -> a ++ "\n" ++ b) l diff --git a/tools/Language/SQL/SimpleSQL/Postgres.lhs b/tools/Language/SQL/SimpleSQL/Postgres.lhs index 81a52d5..ae66ef2 100644 --- a/tools/Language/SQL/SimpleSQL/Postgres.lhs +++ b/tools/Language/SQL/SimpleSQL/Postgres.lhs @@ -264,6 +264,8 @@ select page reference > ,"SELECT 2+2;" -> ,"SELECT distributors.* WHERE distributors.name = 'Westward';" +> -- simple-sql-parser doesn't support where without from +> -- this can be added for the postgres dialect when it is written +> --,"SELECT distributors.* WHERE distributors.name = 'Westward';" > ] diff --git a/tools/Language/SQL/SimpleSQL/SQL2003.lhs b/tools/Language/SQL/SimpleSQL/SQL2003.lhs index 7ba0c31..7abaa98 100644 --- a/tools/Language/SQL/SimpleSQL/SQL2003.lhs +++ b/tools/Language/SQL/SimpleSQL/SQL2003.lhs @@ -8,18 +8,46 @@ We are only interested in the query syntax, goes through sections 5-10 The goal is to create some coverage tests to get close to supporting a large amount of the SQL. -> module Language.SQL.SimpleSQL.SQL2003 where +> module Language.SQL.SimpleSQL.SQL2003 (sql2003Tests) where > import Language.SQL.SimpleSQL.TestTypes > import Language.SQL.SimpleSQL.Syntax - +> sql2003Tests :: TestItem +> sql2003Tests = Group "sql2003Tests" +> [stringLiterals +> ,nationalCharacterStringLiterals +> ,unicodeStringLiterals +> ,binaryStringLiterals +> ,numericLiterals +> ,dateAndTimeLiterals +> ,booleanLiterals +> ,identifiers +> ,typeNames +> ,parenthesizedValueExpression +> ,targetSpecification +> ,contextuallyTypeValueSpec +> ,nextValueExpression +> ,arrayElementReference +> ,multisetElementReference +> ,numericValueExpression +> ,booleanValueExpression +> ,arrayValueConstructor +> ,tableValueConstructor +> ,fromClause +> ,whereClause +> ,groupbyClause +> ,querySpecification +> ,queryExpressions +> ,sortSpecificationList +> ] = 5 Lexical Elements Basic definitions of characters used, tokens, symbols, etc. Most of this section would normally be handled within the lexical analyzer rather than in the grammar proper. Further, the original document does not quote the various single characters, which makes it hard to process automatically. -[There seems to be a lot of unused stuff here, so skip this section and only do bits which +[There seems to be a lot of unused stuff here, so skip this section +and only do bits which are needed by other bits] 5.1 (p151) @@ -488,8 +516,8 @@ standards to include everything that was dropped also? TODO: how to escapes work here? -> bitBinaryStringLiterals :: TestItem -> bitBinaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr) +> binaryStringLiterals :: TestItem +> binaryStringLiterals = Group "bit and hex string literals" $ map (uncurry TestValueExpr) > [("B'101010'", undefined) > ,("X'7f7f7f'", undefined) > ] @@ -1031,11 +1059,12 @@ TODO: review how the special keywords are parsed and add tests for these > targetSpecification :: TestItem > targetSpecification = Group "target specification" $ map (uncurry TestValueExpr) > [(":hostparam", undefined) +> ,(":hostparam indicator :another_host_param", undefined) > ,("?", undefined) > ,(":h[3]", undefined) > ] -TODO: modules stuff, indicators, not sure what current_collation is +TODO: modules stuff, not sure what current_collation is for or how it works @@ -1849,7 +1878,7 @@ Specify a set of s to be constructed into a table. ::= [ { }... ] > tableValueConstructor :: TestItem -> tableValueConstructor = Group "table value constructor" $ map (uncurry TestValueExpr) +> tableValueConstructor = Group "table value constructor" $ map (uncurry TestQueryExpr) > [("values (1,2), (a+b,(select count(*) from t));", undefined) > ] @@ -1869,7 +1898,7 @@ Specify a table or a grouped table. TODO: expand on these tests and review uncovered grammar > fromClause :: TestItem -> fromClause = Group "from clause" $ map (uncurry TestValueExpr) +> fromClause = Group "from clause" $ map (uncurry TestQueryExpr) > [("select * from t,u", undefined) > ,("select * from t as a", undefined) @@ -1990,7 +2019,7 @@ Specify a table derived by the application of a to the result ::= WHERE > whereClause :: TestItem -> whereClause = Group "where clause" $ map (uncurry TestValueExpr) +> whereClause = Group "where clause" $ map (uncurry TestQueryExpr) > [("select * from t where a = 5", undefined)] @@ -2042,7 +2071,7 @@ It seems even in sql 2003, you can only put column references in the groups, and not general value expressions. > groupbyClause :: TestItem -> groupbyClause = Group "group by clause" $ map (uncurry TestValueExpr) +> groupbyClause = Group "group by clause" $ map (uncurry TestQueryExpr) > [("select a, sum(b) from t group by a", undefined) > ,("select a, c,sum(b) from t group by a,c", undefined) > ,("select a, c,sum(b) from t group by a,c collate x", undefined) @@ -2146,7 +2175,7 @@ TODO: review this and add more variants > querySpecification :: TestItem -> querySpecification = Group "query specification" $ map (uncurry TestValueExpr) +> querySpecification = Group "query specification" $ map (uncurry TestQueryExpr) > [("select a from t", undefined) > ,("select all a from t", undefined) > ,("select distinct a from t", undefined) @@ -2206,7 +2235,7 @@ TODO: common table expressions ::= > queryExpressions :: TestItem -> queryExpressions = Group "query expressions" $ map (uncurry TestValueExpr) +> queryExpressions = Group "query expressions" $ map (uncurry TestQueryExpr) > [("select a from t union select a from u", undefined) > ,("select a from t union all select a from u", undefined) @@ -2784,7 +2813,7 @@ Specify a sort order. TODO: review sort specifications > sortSpecificationList :: TestItem -> sortSpecificationList = Group "sort specification list" $ map (uncurry TestValueExpr) +> sortSpecificationList = Group "sort specification list" $ map (uncurry TestQueryExpr) > [("select * from t order by a", undefined) > ,("select * from t order by a,b", undefined) > ,("select * from t order by a asc,b", undefined) @@ -2794,3 +2823,7 @@ TODO: review sort specifications > ] TODO: what happened to the collation in order by? +Answer: sort used to be a column reference with an optional +collate. Since it is now a value expression, the collate doesn't need +to be mentioned here. + diff --git a/tools/Language/SQL/SimpleSQL/Tests.lhs b/tools/Language/SQL/SimpleSQL/Tests.lhs index 87439f0..ea655b4 100644 --- a/tools/Language/SQL/SimpleSQL/Tests.lhs +++ b/tools/Language/SQL/SimpleSQL/Tests.lhs @@ -28,7 +28,7 @@ test data to the Test.Framework tests. > import Language.SQL.SimpleSQL.ValueExprs > import Language.SQL.SimpleSQL.Tpch - +> import Language.SQL.SimpleSQL.SQL2003 Order the tests to start from the simplest first. This is also the order on the generated documentation. @@ -44,6 +44,7 @@ order on the generated documentation. > ,fullQueriesTests > ,postgresTests > ,tpchTests +> ,sql2003Tests > ] > tests :: Test.Framework.Test diff --git a/tools/ShowErrors.lhs b/tools/ShowErrors.lhs new file mode 100644 index 0000000..6a0edf0 --- /dev/null +++ b/tools/ShowErrors.lhs @@ -0,0 +1,7 @@ + + +> import Language.SQL.SimpleSQL.ErrorMessages + + +> main :: IO () +> main = putStrLn $ pExprs valueExpressions queryExpressions