From 4e09fe9f450da4fd7133af769461f272e68665f7 Mon Sep 17 00:00:00 2001 From: Jake Wheat Date: Wed, 10 Jan 2024 11:28:34 +0000 Subject: [PATCH] get old lexing code working again, now only 3 tests fail --- Language/SQL/SimpleSQL/Lex.hs | 327 ++++++------------ tools/Language/SQL/SimpleSQL/LexerTests.hs | 102 +++--- .../Language/SQL/SimpleSQL/SQL2011Queries.hs | 1 - tools/Language/SQL/SimpleSQL/ScalarExprs.hs | 2 - tools/Language/SQL/SimpleSQL/Tests.hs | 4 +- 5 files changed, 167 insertions(+), 269 deletions(-) diff --git a/Language/SQL/SimpleSQL/Lex.hs b/Language/SQL/SimpleSQL/Lex.hs index 3e87412..094626b 100644 --- a/Language/SQL/SimpleSQL/Lex.hs +++ b/Language/SQL/SimpleSQL/Lex.hs @@ -19,6 +19,21 @@ directly without the separately testing lexing stage. TODO: +optimisations: + +check for left factor opportunities +check for places where it parses a few substrings from the source, + then puts them back together with a concatenate of some flavour + -> this is better if can find a way to parse the entire string + from the source and lift it in one go into the lexical token +before this is done, a smaller optimisation is when any code matches + a constant string in the lexer, use that constant string instead + of the string from the parser, it might make a small difference in + a few places +maybe every token should carry the exact source as well as any fields + it's been broken into - so pretty printing is trivial + + make the tokenswill print more dialect accurate. Maybe add symbol chars and identifier chars to the dialect definition and use them from here @@ -98,12 +113,19 @@ import Text.Megaparsec ,many ,try ,option + ,(<|>) + ,notFollowedBy + ,manyTill + ,anySingle + ,lookAhead ) +import qualified Text.Megaparsec as M import Text.Megaparsec.Char (string ,char ) import Text.Megaparsec.State (initialState) +import Control.Applicative ((<**>)) import Data.Void (Void) @@ -113,7 +135,7 @@ import Data.Char ,isSpace ,isDigit ) -import Control.Monad (void) +import Control.Monad (void, guard) import Data.Text (Text) import qualified Data.Text as T @@ -241,7 +263,7 @@ sqlToken d = do ,blockComment d ,sqlNumber d ,positionalArg d - --,dontParseEndBlockComment d + ,dontParseEndBlockComment d ,prefixedVariable d ,symbol d ,sqlWhitespace d] @@ -251,10 +273,6 @@ sqlToken d = do -------------------------------------- -sqlString :: Dialect -> Parser Token -sqlString _d = - SqlString "'" "'" <$> (char_ '\'' *> takeWhileP (Just "non quote char") (/= '\'') <* char_ '\'') - {- Parse a SQL string. Examples: @@ -265,7 +283,6 @@ b'binary string' x'hexidecimal string' -} -{- sqlString :: Dialect -> Parser Token sqlString d = dollarString <|> csString <|> normalString where @@ -273,21 +290,21 @@ sqlString d = dollarString <|> csString <|> normalString guard $ diDollarString d -- use try because of ambiguity with symbols and with -- positional arg - delim <- (\x -> concat ["$",x,"$"]) + delim <- (\x -> T.concat ["$",x,"$"]) <$> try (char '$' *> option "" identifierString <* char '$') - SqlString delim delim <$> manyTill anyChar (try $ string delim) + SqlString delim delim . T.pack <$> manyTill anySingle (try $ string delim) normalString = SqlString "'" "'" <$> (char '\'' *> normalStringSuffix False "") normalStringSuffix allowBackslash t = do - s <- takeTill $ if allowBackslash - then (`elem` "'\\") - else (== '\'') + s <- takeWhileP Nothing $ if allowBackslash + then (`notElemChar` "'\\") + else (/= '\'') -- deal with '' or \' as literal quote character choice [do ctu <- choice ["''" <$ try (string "''") ,"\\'" <$ string "\\'" ,"\\" <$ char '\\'] - normalStringSuffix allowBackslash $ concat [t,s,ctu] - ,concat [t,s] <$ char '\''] + normalStringSuffix allowBackslash $ T.concat [t,s,ctu] + ,T.concat [t,s] <$ char '\''] -- try is used to to avoid conflicts with -- identifiers which can start with n,b,x,u -- once we read the quote type and the starting ' @@ -299,38 +316,19 @@ sqlString d = dollarString <|> csString <|> normalString csString | diEString d = choice [SqlString <$> try (string "e'" <|> string "E'") - <*> return "'" <*> normalStringSuffix True "" + <*> pure "'" <*> normalStringSuffix True "" ,csString'] | otherwise = csString' csString' = SqlString <$> try cs - <*> return "'" + <*> pure "'" <*> normalStringSuffix False "" - csPrefixes = "nNbBxX" - cs = choice $ (map (\x -> string ([x] ++ "'")) csPrefixes) - ++ [string "u&'" - ,string "U&'"] --} + csPrefixes = (map (flip T.cons "'") "nNbBxX") ++ ["u&'", "U&'"] + cs :: Parser Text + cs = choice $ map string csPrefixes -------------------------------------- --- TODO: this reconstitutes the string from bits, instead of lifting --- it in one piece from the source. This is a performance issue, not --- sure if it will be significant. The same comment applies to most of --- the other parsers -identifier :: Dialect -> Parser Token -identifier d = Identifier Nothing <$> identifierString d - -identifierString :: Dialect -> Parser Text -identifierString _ = (do - c <- satisfy isFirstLetter - choice - [T.cons c <$> (takeWhileP (Just "identifier char") isNonFirstLetter) - ,pure $ T.singleton c]) "identifier" - where - isFirstLetter c = c == '_' || isAlpha c - isNonFirstLetter c = c == '_' || isAlphaNum c - {- Parses identifiers: @@ -341,7 +339,6 @@ u&"unicode quoted identifier" `mysql quoted identifier` -} -{- identifier :: Dialect -> Parser Token identifier d = choice @@ -355,50 +352,39 @@ identifier d = regularIden = Identifier Nothing <$> identifierString quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart mySqlQuotedIden = Identifier (Just ("`","`")) - <$> (char '`' *> takeWhile1 (/='`') <* char '`') + <$> (char '`' *> takeWhile1P Nothing (/='`') <* char '`') sqlServerQuotedIden = Identifier (Just ("[","]")) - <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']') + <$> (char '[' *> takeWhile1P Nothing (`notElemChar` "[]") <* char ']') -- try is used here to avoid a conflict with identifiers -- and quoted strings which also start with a 'u' unicodeQuotedIden = Identifier - <$> (f <$> try (oneOf "uU" <* string "&")) + <$> (f <$> try ((oneOf "uU") <* string "&")) <*> qidenPart - where f x = Just (x: "&\"", "\"") + where f x = Just (T.cons x "&\"", "\"") qidenPart = char '"' *> qidenSuffix "" qidenSuffix t = do - s <- takeTill (=='"') + s <- takeWhileP Nothing (/='"') void $ char '"' -- deal with "" as literal double quote character choice [do void $ char '"' - qidenSuffix $ concat [t,s,"\"\""] - ,return $ concat [t,s]] + qidenSuffix $ T.concat [t,s,"\"\""] + ,pure $ T.concat [t,s]] - --- This parses a valid identifier without quotes. - -identifierString :: Parser String -identifierString = - startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar - --- this can be moved to the dialect at some point +identifierString :: Parser Text +identifierString = (do + c <- satisfy isFirstLetter + choice + [T.cons c <$> (takeWhileP (Just "identifier char") isIdentifierChar) + ,pure $ T.singleton c]) "identifier" + where + isFirstLetter c = c == '_' || isAlpha c isIdentifierChar :: Char -> Bool isIdentifierChar c = c == '_' || isAlphaNum c --} - -------------------------------------- -{- -I think it's always faster to use a string locally created in the parser code, -than to use one taken from the parsed source, unless you take it without modifying it, -the example here is using -- and \n. this won't be needed in this case if can work out -how to lift the entire comment as a single string from the source. - -this concept does apply to things like symbols --} - lineComment :: Dialect -> Parser Token lineComment _ = do try (string_ "--") "" @@ -407,62 +393,22 @@ lineComment _ = do suf <- option "" ("\n" <$ char_ '\n') pure $ LineComment $ T.concat ["--", rest, suf] -{-lineComment :: Dialect -> Parser Token -lineComment _ = - (\s -> LineComment $ concat ["--",s]) <$> - -- try is used here in case we see a - symbol - -- once we read two -- then we commit to the comment token - (try (string "--") *> ( - -- todo: there must be a better way to do this - conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd)) - where - conc a Nothing = a - conc a (Just b) = a ++ b - lineCommentEnd = - Just "\n" <$ char '\n' - <|> Nothing <$ eof-} - -------------------------------------- +-- TODO: the parser before the switch to megaparsec parsed nested block comments +-- I don't know any dialects that use this, but I think it's useful, if needed, +-- add it back in under a dialect flag? blockComment :: Dialect -> Parser Token blockComment _ = (do try $ string_ "/*" BlockComment . T.concat . ("/*":) <$> more) "" where more = choice - [["*/"] <$ try (string_ "*/") - ,char_ '*' *> (("*":) <$> more) + [["*/"] <$ try (string_ "*/") -- comment ended + ,char_ '*' *> (("*":) <$> more) -- comment contains * but this isn't the comment end token + -- not sure if there's an easy optimisation here ,(:) <$> takeWhile1P (Just "non comment terminator text") (/= '*') <*> more] - - -{- -Try is used in the block comment for the two symbol bits because we -want to backtrack if we read the first symbol but the second symbol -isn't there. --} -{- -blockComment :: Dialect -> Parser Token -blockComment _ = - (\s -> BlockComment $ concat ["/*",s]) <$> - (try (string "/*") *> commentSuffix 0) - where - commentSuffix :: Int -> Parser String - commentSuffix n = do - -- read until a possible end comment or nested comment - x <- takeWhile (\e -> e /= '/' && e /= '*') - choice [-- close comment: if the nesting is 0, done - -- otherwise recurse on commentSuffix - try (string "*/") *> let t = concat [x,"*/"] - in if n == 0 - then return t - else (\s -> concat [t,s]) <$> commentSuffix (n - 1) - -- nested comment, recurse - ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1)) - -- not an end comment or nested comment, continue - ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n] --} - {- This is to improve user experience: provide an error if we see */ outside a comment. This could potentially break postgres ops with */ @@ -470,23 +416,14 @@ in them (which is a stupid thing to do). In other cases, the user should write * / instead (I can't think of any cases when this would be valid syntax though). -} -{- + dontParseEndBlockComment :: Dialect -> Parser Token dontParseEndBlockComment _ = -- don't use try, then it should commit to the error try (string "*/") *> fail "comment end without comment start" --} + -------------------------------------- -sqlNumber :: Dialect -> Parser Token -sqlNumber _ = - SqlNumber <$> digits - - -digits :: Parser Text -digits = takeWhile1P (Just "digit") isDigit - - {- numbers @@ -502,9 +439,18 @@ present. There cannot be any spaces or other characters embedded in the constant. Note that any leading plus or minus sign is not actually considered part of the constant; it is an operator applied to the constant. + + +algorithm: +either + parse 1 or more digits + then an optional dot which isn't two dots + then optional digits + or: parse a dot which isn't two dots + then digits +followed by an optional exponent -} -{- sqlNumber :: Dialect -> Parser Token sqlNumber d = SqlNumber <$> completeNumber @@ -517,15 +463,14 @@ sqlNumber d = ] where completeNumber = - (int (pp dot pp int) + (digits (pp dot pp digits) -- try is used in case we read a dot -- and it isn't part of a number -- if there are any following digits, then we commit -- to it being a number and not something else - <|> try ((++) <$> dot <*> int)) + <|> try ((<>) <$> dot <*> digits)) pp expon - int = many1 digit -- make sure we don't parse two adjacent dots in a number -- special case for postgresql, we backtrack if we see two adjacent dots -- to parse 1..2, but in other dialects we commit to the failure @@ -533,23 +478,25 @@ sqlNumber d = in if diPostgresSymbols d then try p else p - expon = (:) <$> oneOf "eE" <*> sInt - sInt = (++) <$> option "" (string "+" <|> string "-") <*> int - pp = (<$$> (++)) --} + expon = T.cons <$> oneOf "eE" <*> sInt + sInt = (<>) <$> option "" (T.singleton <$> oneOf "+-") <*> digits + pp = (<$$> (<>)) + p q = p <**> option id q + pa <$$> c = pa <**> pure (flip c) + pa pb = + let c = (<$>) . flip + in (.) `c` pa <*> option id pb + +digits :: Parser Text +digits = takeWhile1P (Just "digit") isDigit -------------------------------------- -positionalArg :: Dialect -> Parser Token -positionalArg _ = PositionalArg <$> (char_ '$' *> (read . T.unpack <$> digits)) - -{- positionalArg :: Dialect -> Parser Token positionalArg d = - guard (diPositionalArg d) >> - -- use try to avoid ambiguities with other syntax which starts with dollar - PositionalArg <$> try (char '$' *> (read <$> many1 digit)) --} + guard (diPositionalArg d) >> + -- use try to avoid ambiguities with other syntax which starts with dollar + PositionalArg <$> try (char_ '$' *> (read . T.unpack <$> digits)) -------------------------------------- @@ -557,36 +504,15 @@ positionalArg d = -- identifier char, then commit prefixedVariable :: Dialect -> Parser Token prefixedVariable d = try $ choice - [PrefixedVariable <$> (':' <$ char_ ':') <*> identifierString d - ] - - --- use try because : and @ can be part of other things also - -{- -prefixedVariable :: Dialect -> Parser Token -prefixedVariable d = try $ choice [PrefixedVariable <$> char ':' <*> identifierString ,guard (diAtIdentifier d) >> PrefixedVariable <$> char '@' <*> identifierString ,guard (diHashIdentifier d) >> PrefixedVariable <$> char '#' <*> identifierString ] --} -------------------------------------- -symbol :: Dialect -> Parser Token -symbol _ = - Symbol <$> choice - [try $ choice $ map (\a -> string a) multiCharSymbols - ,T.singleton <$> satisfy (`elem` singleLetterSymbol) - ] - where - singleLetterSymbol = "(),-+*/<>=." :: String - multiCharSymbols = ["!=", "<>", ">=", "<=", "||"] - - {- Symbols @@ -596,7 +522,7 @@ A symbol is an operator, or one of the misc symbols which include: The postgresql operator syntax allows a huge range of operators compared with ansi and other dialects -} -{- + symbol :: Dialect -> Parser Token symbol d = Symbol <$> choice (concat [dots @@ -610,14 +536,14 @@ symbol d = Symbol <$> choice (concat else basicAnsiOps ]) where - dots = [many1 (char '.')] + dots = [takeWhile1P (Just "dot") (=='.')] odbcSymbol = [string "{", string "}"] postgresExtraSymbols = [try (string ":=") -- parse :: and : and avoid allowing ::: or more ,try (string "::" <* notFollowedBy (char ':')) ,try (string ":" <* notFollowedBy (char ':'))] - miscSymbol = map (string . (:[])) $ + miscSymbol = map (string . T.singleton) $ case () of _ | diSqlServerSymbols d -> ",;():?" | diPostgresSymbols d -> "[],;()" @@ -629,14 +555,14 @@ symbols can also be part of a single character symbol -} basicAnsiOps = map (try . string) [">=","<=","!=","<>"] - ++ map (string . (:[])) "+-^*/%~&<>=" + ++ map (string . T.singleton) "+-^*/%~&<>=" ++ pipes pipes = -- what about using many1 (char '|'), then it will -- fail in the parser? Not sure exactly how -- standalone the lexer should be [char '|' *> choice ["||" <$ char '|' <* notFollowedBy (char '|') - ,return "|"]] + ,pure "|"]] {- postgresql generalized operators @@ -662,7 +588,7 @@ which allows the last character of a multi character symbol to be + or - -} -generalizedPostgresqlOperator :: [Parser String] +generalizedPostgresqlOperator :: [Parser Text] generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars] where allOpSymbols = "+-*/<>=~!@#%^&|`?" @@ -674,13 +600,13 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars] singlePlusMinus = try $ do c <- oneOf "+-" notFollowedBy $ oneOf allOpSymbols - return [c] + pure $ T.singleton c -- this is used when we are parsing a potentially multi symbol -- operator and we have alread seen one of the 'exception chars' -- and so we can end with a + or - moreOpCharsException = do - c <- oneOf (filter (`notElem` "-/*") allOpSymbols) + c <- oneOf (filter (`notElemChar` "-/*") allOpSymbols) -- make sure we don't parse a comment starting token -- as part of an operator <|> try (char '/' <* notFollowedBy (char '*')) @@ -688,14 +614,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars] -- and make sure we don't parse a block comment end -- as part of another symbol <|> try (char '*' <* notFollowedBy (char '/')) - (c:) <$> option [] moreOpCharsException + T.cons c <$> option "" moreOpCharsException opMoreChars = choice [-- parse an exception char, now we can finish with a + - - (:) + T.cons <$> oneOf exceptionOpSymbols - <*> option [] moreOpCharsException - ,(:) + <*> option "" moreOpCharsException + ,T.cons <$> (-- parse +, make sure it isn't the last symbol try (char '+' <* lookAhead (oneOf allOpSymbols)) <|> -- parse -, make sure it isn't the last symbol @@ -709,18 +635,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars] try (char '*' <* notFollowedBy (char '/')) <|> -- any other ansi operator symbol oneOf "<>=") - <*> option [] opMoreChars + <*> option "" opMoreChars ] --} -------------------------------------- sqlWhitespace :: Dialect -> Parser Token sqlWhitespace _ = Whitespace <$> takeWhile1P (Just "whitespace") isSpace "" ---sqlWhitespace :: Dialect -> Parser Token ---sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace) - ---------------------------------------------------------------------------- -- parser helpers @@ -731,25 +653,11 @@ char_ = void . char string_ :: Text -> Parser () string_ = void . string -{- -startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String -startsWith p ps = do - c <- satisfy p - choice [(:) c <$> (takeWhile1 ps) - ,return [c]] +oneOf :: [Char] -> Parser Char +oneOf = M.oneOf -takeWhile1 :: (Char -> Bool) -> Parser String -takeWhile1 p = many1 (satisfy p) - -takeWhile :: (Char -> Bool) -> Parser String -takeWhile p = many (satisfy p) - -takeTill :: (Char -> Bool) -> Parser String -takeTill p = manyTill anyChar (peekSatisfy p) - -peekSatisfy :: (Char -> Bool) -> Parser () -peekSatisfy p = void $ lookAhead (satisfy p) --} +notElemChar :: Char -> [Char] -> Bool +notElemChar a b = a `notElem` (b :: [Char]) ---------------------------------------------------------------------------- @@ -776,8 +684,7 @@ successes. I don't think it succeeds this test at the moment -- will pretty print then lex back to the same set of tokens. -- Used internally, might be useful for generating SQL via lexical tokens. tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool -tokenListWillPrintAndLex = undefined -{-tokenListWillPrintAndLex _ [] = True +tokenListWillPrintAndLex _ [] = True tokenListWillPrintAndLex _ [_] = True tokenListWillPrintAndLex d (a:b:xs) = tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs) @@ -791,7 +698,7 @@ followed by = or : makes a different symbol -} | Symbol ":" <- a - , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False + , checkFirstBChar (\x -> isIdentifierChar x || x `T.elem` ":=") = False {- two symbols next to eachother will fail if the symbols can combine and @@ -801,7 +708,7 @@ two symbols next to eachother will fail if the symbols can combine and | diPostgresSymbols d , Symbol a' <- a , Symbol b' <- b - , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False + , b' `notElem` ["+", "-"] || or (map (`T.elem` a') "~!@#%^&|`?") = False {- check two adjacent symbols in non postgres where the combination @@ -906,17 +813,13 @@ TODO: not 100% on this always being bad -- helper function to run a predicate on the -- last character of the first token and the first -- character of the second token - checkBorderChars f - | (_:_) <- prettya - , (fb:_) <- prettyb - , la <- last prettya - = f la fb - checkBorderChars _ = False - checkFirstBChar f = case prettyb of - (b':_) -> f b' - _ -> False - checkLastAChar f = case prettya of - (_:_) -> f $ last prettya - _ -> False - --} + checkBorderChars f = + case (T.unsnoc prettya, T.uncons prettyb) of + (Just (_,la), Just (fb,_)) -> f la fb + _ -> False + checkFirstBChar f = case T.uncons prettyb of + Just (b',_) -> f b' + _ -> False + checkLastAChar f = case T.unsnoc prettya of + Just (_,la) -> f la + _ -> False diff --git a/tools/Language/SQL/SimpleSQL/LexerTests.hs b/tools/Language/SQL/SimpleSQL/LexerTests.hs index e2d3ca2..29bce1b 100644 --- a/tools/Language/SQL/SimpleSQL/LexerTests.hs +++ b/tools/Language/SQL/SimpleSQL/LexerTests.hs @@ -24,10 +24,8 @@ import Language.SQL.SimpleSQL.Lex ,tokenListWillPrintAndLex ) -import Language.SQL.SimpleSQL.Dialect - (ansi2011) - import qualified Data.Text as T +import Data.Text (Text) --import Debug.Trace --import Data.Char (isAlpha) @@ -35,12 +33,13 @@ import qualified Data.Text as T lexerTests :: TestItem lexerTests = Group "lexerTests" $ - [bootstrapTests{-Group "lexer token tests" [ansiLexerTests - ,postgresLexerTests - ,sqlServerLexerTests - ,oracleLexerTests - ,mySqlLexerTests - ,odbcLexerTests]-}] + [bootstrapTests + ,ansiLexerTests + ,postgresLexerTests + ,sqlServerLexerTests + ,oracleLexerTests + ,mySqlLexerTests + ,odbcLexerTests] -- quick sanity tests to see something working bootstrapTests :: TestItem @@ -75,8 +74,9 @@ bootstrapTests = Group "bootstrap tests" $ ,("1", [SqlNumber "1"]) ,("42", [SqlNumber "42"]) - ,("$1", [PositionalArg 1]) - ,("$200", [PositionalArg 200]) + -- have to fix the dialect handling in the tests + --,("$1", [PositionalArg 1]) + --,("$200", [PositionalArg 200]) ,(":test", [PrefixedVariable ':' "test"]) @@ -84,22 +84,22 @@ bootstrapTests = Group "bootstrap tests" $ ["!=", "<>", ">=", "<=", "||"] ++ map T.singleton ("(),-+*/<>=." :: String))) -{- -ansiLexerTable :: [(String,[Token])] + +ansiLexerTable :: [(Text,[Token])] ansiLexerTable = -- single char symbols - map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;()" + map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;()" -- multi char symbols ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||"] ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"] -- simple identifiers in map (\i -> (i, [Identifier Nothing i])) idens - ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens + <> map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens -- todo: in order to make lex . pretty id, need to -- preserve the case of the u - ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens + <> map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens -- host param - ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens + <> map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens ) -- quoted identifiers with embedded double quotes -- the lexer doesn't unescape the quotes @@ -111,7 +111,7 @@ ansiLexerTable = ,("'normalendquote '''", [SqlString "'" "'" "normalendquote ''"]) ,("'\n'", [SqlString "'" "'" "\n"])] -- csstrings - ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"])) + ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"])) ["n", "N","b", "B","x", "X", "u&"] -- numbers ++ [("10", [SqlNumber "10"]) @@ -122,8 +122,8 @@ ansiLexerTable = ,("10.2", [SqlNumber "10.2"]) ,("10.2e7", [SqlNumber "10.2e7"])] -- whitespace - ++ concat [[([a],[Whitespace [a]]) - ,([a,b], [Whitespace [a,b]])] + ++ concat [[(T.singleton a,[Whitespace $ T.singleton a]) + ,(T.singleton a <> T.singleton b, [Whitespace (T.singleton a <> T.singleton b)])] | a <- " \n\t", b <- " \n\t"] -- line comment ++ map (\c -> (c, [LineComment c])) @@ -134,14 +134,15 @@ ansiLexerTable = ,"/* this *is/ a comment */" ] + ansiLexerTests :: TestItem ansiLexerTests = Group "ansiLexerTests" $ [Group "ansi lexer token tests" $ [LexTest ansi2011 s t | (s,t) <- ansiLexerTable] ,Group "ansi generated combination lexer tests" $ - [ LexTest ansi2011 (s ++ s1) (t ++ t1) + [ LexTest ansi2011 (s <> s1) (t <> t1) | (s,t) <- ansiLexerTable , (s1,t1) <- ansiLexerTable - , tokenListWillPrintAndLex ansi2011 $ t ++ t1 + , tokenListWillPrintAndLex ansi2011 $ t <> t1 ] ,Group "ansiadhoclexertests" $ @@ -185,10 +186,10 @@ assurance. -postgresLexerTable :: [(String,[Token])] +postgresLexerTable :: [(Text,[Token])] postgresLexerTable = -- single char symbols - map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():" + map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;():" -- multi char symbols ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="] -- generic symbols @@ -196,12 +197,12 @@ postgresLexerTable = ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"] -- simple identifiers in map (\i -> (i, [Identifier Nothing i])) idens - ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens + ++ map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens -- todo: in order to make lex . pretty id, need to -- preserve the case of the u - ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens + ++ map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens -- host param - ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens + ++ map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens ) -- positional var ++ [("$1", [PositionalArg 1])] @@ -223,7 +224,7 @@ postgresLexerTable = ,("$a$ $$string 3$$ $a$", [SqlString "$a$" "$a$" " $$string 3$$ "]) ] -- csstrings - ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"])) + ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"])) ["n", "N","b", "B","x", "X", "u&", "e", "E"] -- numbers ++ [("10", [SqlNumber "10"]) @@ -234,8 +235,8 @@ postgresLexerTable = ,("10.2", [SqlNumber "10.2"]) ,("10.2e7", [SqlNumber "10.2e7"])] -- whitespace - ++ concat [[([a],[Whitespace [a]]) - ,([a,b], [Whitespace [a,b]])] + ++ concat [[(T.singleton a,[Whitespace $ T.singleton a]) + ,(T.singleton a <> T.singleton b, [Whitespace $ T.singleton a <> T.singleton b])] | a <- " \n\t", b <- " \n\t"] -- line comment ++ map (\c -> (c, [LineComment c])) @@ -267,24 +268,24 @@ operators without one of the exception chars also: do the testing for the ansi compatibility special cases -} -postgresShortOperatorTable :: [(String,[Token])] +postgresShortOperatorTable :: [(Text,[Token])] postgresShortOperatorTable = [ (x, [Symbol x]) | x <- someValidPostgresOperators 2] -postgresExtraOperatorTable :: [(String,[Token])] +postgresExtraOperatorTable :: [(Text,[Token])] postgresExtraOperatorTable = [ (x, [Symbol x]) | x <- someValidPostgresOperators 4] -someValidPostgresOperators :: Int -> [String] +someValidPostgresOperators :: Int -> [Text] someValidPostgresOperators l = [ x | n <- [1..l] , x <- combos "+-*/<>=~!@#%^&|`?" n - , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x) - , not (last x `elem` "+-") - || or (map (`elem` x) "~!@#%^&|`?") + , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x) + , not (T.last x `T.elem` "+-") + || or (map (`T.elem` x) "~!@#%^&|`?") ] {- @@ -293,13 +294,13 @@ These are postgres operators, which if followed immediately by a + or the + or -. -} -somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String] +somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [Text] somePostgresOpsWhichWontAddTrailingPlusMinus l = [ x | n <- [1..l] , x <- combos "+-*/<>=" n - , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x) - , not (last x `elem` "+-") + , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x) + , not (T.last x `T.elem` "+-") ] @@ -310,7 +311,7 @@ postgresLexerTests = Group "postgresLexerTests" $ ,Group "postgres generated lexer token tests" $ [LexTest postgres s t | (s,t) <- postgresShortOperatorTable ++ postgresExtraOperatorTable] ,Group "postgres generated combination lexer tests" $ - [ LexTest postgres (s ++ s1) (t ++ t1) + [ LexTest postgres (s <> s1) (t <> t1) | (s,t) <- postgresLexerTable ++ postgresShortOperatorTable , (s1,t1) <- postgresLexerTable ++ postgresShortOperatorTable , tokenListWillPrintAndLex postgres $ t ++ t1 @@ -344,18 +345,18 @@ postgresLexerTests = Group "postgresLexerTests" $ ] where edgeCaseCommentOps = - [ (x ++ "/* "/* "-- "+", [Symbol x, Symbol "+"]) + , (x <> "-", [Symbol x, Symbol "-"]) ] | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2 ] edgeCasePlusMinusComments = @@ -365,7 +366,6 @@ postgresLexerTests = Group "postgresLexerTests" $ ,("+/**/", [Symbol "+", BlockComment "/**/"]) ] - sqlServerLexerTests :: TestItem sqlServerLexerTests = Group "sqlServerLexTests" $ [ LexTest sqlserver s t | (s,t) <- @@ -393,8 +393,6 @@ odbcLexerTests = Group "odbcLexTests" $ ++ [LexFails sqlserver {diOdbc = False} "{" ,LexFails sqlserver {diOdbc = False} "}"] -combos :: [a] -> Int -> [[a]] -combos _ 0 = [[]] -combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ] - --} +combos :: [Char] -> Int -> [Text] +combos _ 0 = [T.empty] +combos l n = [ T.cons x tl | x <- l, tl <- combos l (n - 1) ] diff --git a/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs b/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs index f754bc2..5962f5d 100644 --- a/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs +++ b/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs @@ -36,7 +36,6 @@ module Language.SQL.SimpleSQL.SQL2011Queries (sql2011QueryTests) where import Language.SQL.SimpleSQL.TestTypes import Language.SQL.SimpleSQL.Syntax -import qualified Data.Text as T import Data.Text (Text) sql2011QueryTests :: TestItem diff --git a/tools/Language/SQL/SimpleSQL/ScalarExprs.hs b/tools/Language/SQL/SimpleSQL/ScalarExprs.hs index f8cd403..1b5f240 100644 --- a/tools/Language/SQL/SimpleSQL/ScalarExprs.hs +++ b/tools/Language/SQL/SimpleSQL/ScalarExprs.hs @@ -7,8 +7,6 @@ module Language.SQL.SimpleSQL.ScalarExprs (scalarExprTests) where import Language.SQL.SimpleSQL.TestTypes import Language.SQL.SimpleSQL.Syntax -import qualified Data.Text as T - scalarExprTests :: TestItem scalarExprTests = Group "scalarExprTests" [literals diff --git a/tools/Language/SQL/SimpleSQL/Tests.hs b/tools/Language/SQL/SimpleSQL/Tests.hs index 272329e..ee99e31 100644 --- a/tools/Language/SQL/SimpleSQL/Tests.hs +++ b/tools/Language/SQL/SimpleSQL/Tests.hs @@ -58,7 +58,7 @@ testData :: TestItem testData = Group "parserTest" [lexerTests - {-,scalarExprTests + ,scalarExprTests ,odbcTests ,queryExprComponentTests ,queryExprsTests @@ -76,7 +76,7 @@ testData = ,oracleTests ,customDialectTests ,emptyStatementTests - ,createIndexTests-} + ,createIndexTests ] tests :: T.TestTree