get old lexing code working again, now only 3 tests fail

2024-01-10 11:28:34 +00:00 · 2024-01-10 11:28:34 +00:00 · 4e09fe9f45
commit 4e09fe9f45
parent 0f307f51c7
5 changed files with 167 additions and 269 deletions
--- a/Language/SQL/SimpleSQL/Lex.hs
+++ b/Language/SQL/SimpleSQL/Lex.hs
@ -19,6 +19,21 @@ directly without the separately testing lexing stage.

 TODO:

+optimisations:
+
+check for left factor opportunities
+check for places where it parses a few substrings from the source,
+  then puts them back together with a concatenate of some flavour
+  -> this is better if can find a way to parse the entire string
+  from the source and lift it in one go into the lexical token
+before this is done, a smaller optimisation is when any code matches
+  a constant string in the lexer, use that constant string instead
+  of the string from the parser, it might make a small difference in
+  a few places
+maybe every token should carry the exact source as well as any fields
+  it's been broken into - so pretty printing is trivial
+
+
 make the tokenswill print more dialect accurate. Maybe add symbol
  chars and identifier chars to the dialect definition and use them from
  here
@ -98,12 +113,19 @@ import Text.Megaparsec
    ,many
    ,try
    ,option
+    ,(<|>)
+    ,notFollowedBy
+    ,manyTill
+    ,anySingle
+    ,lookAhead
    )
+import qualified Text.Megaparsec as M
 import Text.Megaparsec.Char
    (string
    ,char
    )
 import Text.Megaparsec.State (initialState)
+import Control.Applicative ((<**>))

 import Data.Void (Void)

@ -113,7 +135,7 @@ import Data.Char
    ,isSpace
    ,isDigit
    )
-import Control.Monad (void)
+import Control.Monad (void, guard)
 import Data.Text (Text)
 import qualified Data.Text as T

@ -241,7 +263,7 @@ sqlToken d = do
         ,blockComment d
         ,sqlNumber d
         ,positionalArg d
-         --,dontParseEndBlockComment d
+         ,dontParseEndBlockComment d
         ,prefixedVariable d
         ,symbol d
         ,sqlWhitespace d]
@ -251,10 +273,6 @@ sqlToken d = do

 --------------------------------------

-sqlString :: Dialect -> Parser Token
-sqlString _d =
-    SqlString "'" "'" <$> (char_ '\'' *> takeWhileP (Just "non quote char") (/= '\'') <* char_ '\'')
-
 {-
 Parse a SQL string. Examples:

@ -265,7 +283,6 @@ b'binary string'
 x'hexidecimal string'
 -}

-{-
 sqlString :: Dialect -> Parser Token
 sqlString d = dollarString <|> csString <|> normalString
  where
@ -273,21 +290,21 @@ sqlString d = dollarString <|> csString <|> normalString
        guard $ diDollarString d
        -- use try because of ambiguity with symbols and with
        -- positional arg
-        delim <- (\x -> concat ["$",x,"$"])
+        delim <- (\x -> T.concat ["$",x,"$"])
                 <$> try (char '$' *> option "" identifierString <* char '$')
-        SqlString delim delim  <$> manyTill anyChar (try $ string delim)
+        SqlString delim delim . T.pack <$> manyTill anySingle (try $ string delim)
    normalString = SqlString "'" "'" <$> (char '\'' *> normalStringSuffix False "")
    normalStringSuffix allowBackslash t = do
-        s <- takeTill $ if allowBackslash
-                        then (`elem` "'\\")
-                        else (== '\'')
+        s <- takeWhileP Nothing $ if allowBackslash
+                                  then (`notElemChar` "'\\")
+                                  else (/= '\'')
        -- deal with '' or \' as literal quote character
        choice [do
                ctu <- choice ["''" <$ try (string "''")
                              ,"\\'" <$ string "\\'"
                              ,"\\" <$ char '\\']
-                normalStringSuffix allowBackslash $ concat [t,s,ctu]
-               ,concat [t,s] <$ char '\'']
+                normalStringSuffix allowBackslash $ T.concat [t,s,ctu]
+               ,T.concat [t,s] <$ char '\'']
    -- try is used to to avoid conflicts with
    -- identifiers which can start with n,b,x,u
    -- once we read the quote type and the starting '
@ -299,38 +316,19 @@ sqlString d = dollarString <|> csString <|> normalString
    csString
      | diEString d =
        choice [SqlString <$> try (string "e'" <|> string "E'")
-                          <*> return "'" <*> normalStringSuffix True ""
+                          <*> pure "'" <*> normalStringSuffix True ""
               ,csString']
      | otherwise = csString'
    csString' = SqlString
                <$> try cs
-                <*> return "'"
+                <*> pure "'"
                <*> normalStringSuffix False ""
-    csPrefixes = "nNbBxX"
-    cs = choice $ (map (\x -> string ([x] ++ "'")) csPrefixes)
-                  ++ [string "u&'"
-                     ,string "U&'"]
-}
+    csPrefixes = (map (flip T.cons "'") "nNbBxX") ++ ["u&'", "U&'"]
+    cs :: Parser Text
+    cs = choice $ map string csPrefixes

 --------------------------------------

-- TODO: this reconstitutes the string from bits, instead of lifting
-- it in one piece from the source. This is a performance issue, not
-- sure if it will be significant. The same comment applies to most of
-- the other parsers
-identifier :: Dialect -> Parser Token
-identifier d = Identifier Nothing <$> identifierString d
-
-identifierString :: Dialect -> Parser Text
-identifierString _ = (do
-    c <- satisfy isFirstLetter
-    choice
-        [T.cons c <$> (takeWhileP (Just "identifier char") isNonFirstLetter)
-        ,pure $ T.singleton c]) <?> "identifier"
-  where
-     isFirstLetter c = c == '_' || isAlpha c
-     isNonFirstLetter c = c == '_' || isAlphaNum c
-
 {-
 Parses identifiers:

@ -341,7 +339,6 @@ u&"unicode quoted identifier"
 `mysql quoted identifier`
 -}

-{-
 identifier :: Dialect -> Parser Token
 identifier d =
    choice
@ -355,50 +352,39 @@ identifier d =
    regularIden = Identifier Nothing <$> identifierString
    quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
    mySqlQuotedIden = Identifier (Just ("`","`"))
-                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
+                      <$> (char '`' *> takeWhile1P Nothing (/='`') <* char '`')
    sqlServerQuotedIden = Identifier (Just ("[","]"))
-                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
+                          <$> (char '[' *> takeWhile1P Nothing (`notElemChar` "[]") <* char ']')
    -- try is used here to avoid a conflict with identifiers
    -- and quoted strings which also start with a 'u'
    unicodeQuotedIden = Identifier
-                        <$> (f <$> try (oneOf "uU" <* string "&"))
+                        <$> (f <$> try ((oneOf "uU") <* string "&"))
                        <*> qidenPart
-      where f x = Just (x: "&\"", "\"")
+      where f x = Just (T.cons x "&\"", "\"")
    qidenPart = char '"' *> qidenSuffix ""
    qidenSuffix t = do
-        s <- takeTill (=='"')
+        s <- takeWhileP Nothing (/='"')
        void $ char '"'
        -- deal with "" as literal double quote character
        choice [do
                void $ char '"'
-                qidenSuffix $ concat [t,s,"\"\""]
-               ,return $ concat [t,s]]
+                qidenSuffix $ T.concat [t,s,"\"\""]
+               ,pure $ T.concat [t,s]]

-
-- This parses a valid identifier without quotes.
-
-identifierString :: Parser String
-identifierString =
-    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
-
-- this can be moved to the dialect at some point
+identifierString :: Parser Text
+identifierString = (do
+    c <- satisfy isFirstLetter
+    choice
+        [T.cons c <$> (takeWhileP (Just "identifier char") isIdentifierChar)
+        ,pure $ T.singleton c]) <?> "identifier"
+  where
+     isFirstLetter c = c == '_' || isAlpha c

 isIdentifierChar :: Char -> Bool
 isIdentifierChar c = c == '_' || isAlphaNum c

-}
-
 --------------------------------------

-{-
-I think it's always faster to use a string locally created in the parser code,
-than to use one taken from the parsed source, unless you take it without modifying it,
-the example here is using -- and \n. this won't be needed in this case if can work out
-how to lift the entire comment as a single string from the source.
-
-this concept does apply to things like symbols
-}
-
 lineComment :: Dialect -> Parser Token
 lineComment _ = do
    try (string_ "--") <?> ""
@ -407,62 +393,22 @@ lineComment _ = do
    suf <- option "" ("\n" <$ char_ '\n')
    pure $ LineComment $ T.concat ["--", rest, suf]

-{-lineComment :: Dialect -> Parser Token
-lineComment _ =
-    (\s -> LineComment $ concat ["--",s]) <$>
-    -- try is used here in case we see a - symbol
-    -- once we read two -- then we commit to the comment token
-    (try (string "--") *> (
-        -- todo: there must be a better way to do this
-     conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
-  where
-    conc a Nothing = a
-    conc a (Just b) = a ++ b
-    lineCommentEnd =
-        Just "\n" <$ char '\n'
-        <|> Nothing <$ eof-}
-
 --------------------------------------

+-- TODO: the parser before the switch to megaparsec parsed nested block comments
+-- I don't know any dialects that use this, but I think it's useful, if needed,
+-- add it back in under a dialect flag?
 blockComment :: Dialect -> Parser Token
 blockComment _ = (do
    try $ string_ "/*"
    BlockComment . T.concat . ("/*":) <$> more) <?> ""
  where
    more = choice
-        [["*/"] <$ try (string_ "*/")
-        ,char_ '*' *> (("*":) <$> more)
+        [["*/"] <$ try (string_ "*/")  -- comment ended
+        ,char_ '*' *> (("*":) <$> more) -- comment contains * but this isn't the comment end token
+        -- not sure if there's an easy optimisation here
        ,(:) <$> takeWhile1P (Just "non comment terminator text") (/= '*') <*> more]

-
-
-{-
-Try is used in the block comment for the two symbol bits because we
-want to backtrack if we read the first symbol but the second symbol
-isn't there.
-}
-{-
-blockComment :: Dialect -> Parser Token
-blockComment _ =
-    (\s -> BlockComment $ concat ["/*",s]) <$>
-    (try (string "/*") *> commentSuffix 0)
-  where
-    commentSuffix :: Int -> Parser String
-    commentSuffix n = do
-      -- read until a possible end comment or nested comment
-      x <- takeWhile (\e -> e /= '/' && e /= '*')
-      choice [-- close comment: if the nesting is 0, done
-              -- otherwise recurse on commentSuffix
-              try (string "*/") *> let t = concat [x,"*/"]
-                                   in if n == 0
-                                      then return t
-                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
-              -- nested comment, recurse
-             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
-              -- not an end comment or nested comment, continue
-             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
-}
-
 {-
 This is to improve user experience: provide an error if we see */
 outside a comment. This could potentially break postgres ops with */
@ -470,23 +416,14 @@ in them (which is a stupid thing to do). In other cases, the user
 should write * / instead (I can't think of any cases when this would
 be valid syntax though).
 -}
-{-
+
 dontParseEndBlockComment :: Dialect -> Parser Token
 dontParseEndBlockComment _ =
    -- don't use try, then it should commit to the error
    try (string "*/") *> fail "comment end without comment start"
-}
+
 --------------------------------------

-sqlNumber :: Dialect -> Parser Token
-sqlNumber _ =
-    SqlNumber <$> digits
-
-
-digits :: Parser Text
-digits = takeWhile1P (Just "digit") isDigit
-
-
 {-
 numbers

@ -502,9 +439,18 @@ present. There cannot be any spaces or other characters embedded in
 the constant. Note that any leading plus or minus sign is not actually
 considered part of the constant; it is an operator applied to the
 constant.
+
+
+algorithm:
+either
+  parse 1 or more digits
+    then an optional dot which isn't two dots
+    then optional digits
+  or: parse a dot which isn't two dots
+    then digits
+followed by an optional exponent
 -}

-{-
 sqlNumber :: Dialect -> Parser Token
 sqlNumber d =
    SqlNumber <$> completeNumber
@ -517,15 +463,14 @@ sqlNumber d =
              ]
  where
    completeNumber =
-      (int <??> (pp dot <??.> pp int)
+      (digits <??> (pp dot <??.> pp digits)
      -- try is used in case we read a dot
      -- and it isn't part of a number
      -- if there are any following digits, then we commit
      -- to it being a number and not something else
-      <|> try ((++) <$> dot <*> int))
+      <|> try ((<>) <$> dot <*> digits))
      <??> pp expon

-    int = many1 digit
    -- make sure we don't parse two adjacent dots in a number
    -- special case for postgresql, we backtrack if we see two adjacent dots
    -- to parse 1..2, but in other dialects we commit to the failure
@ -533,23 +478,25 @@ sqlNumber d =
          in if diPostgresSymbols d
             then try p
             else p
-    expon = (:) <$> oneOf "eE" <*> sInt
-    sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
-    pp = (<$$> (++))
-}
+    expon = T.cons <$> oneOf "eE" <*> sInt
+    sInt = (<>) <$> option "" (T.singleton <$> oneOf "+-") <*> digits
+    pp = (<$$> (<>))
+    p <??> q = p <**> option id q
+    pa <$$> c = pa <**> pure (flip c)
+    pa <??.> pb =
+       let c = (<$>) . flip
+       in (.) `c` pa <*> option id pb
+
+digits :: Parser Text
+digits = takeWhile1P (Just "digit") isDigit

 --------------------------------------

-positionalArg :: Dialect -> Parser Token
-positionalArg _ = PositionalArg <$> (char_ '$' *> (read . T.unpack <$> digits))
-
-{-
 positionalArg :: Dialect -> Parser Token
 positionalArg d =
-  guard (diPositionalArg d) >>
-  -- use try to avoid ambiguities with other syntax which starts with dollar
-  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
-}
+    guard (diPositionalArg d) >>
+    -- use try to avoid ambiguities with other syntax which starts with dollar
+    PositionalArg <$> try (char_ '$' *> (read . T.unpack <$> digits))

 --------------------------------------

@ -557,36 +504,15 @@ positionalArg d =
 -- identifier char, then commit
 prefixedVariable :: Dialect -> Parser Token
 prefixedVariable d = try $ choice
-    [PrefixedVariable <$> (':' <$ char_ ':') <*> identifierString d
-    ]
-
-
-- use try because : and @ can be part of other things also
-
-{-
-prefixedVariable :: Dialect -> Parser Token
-prefixedVariable  d = try $ choice
    [PrefixedVariable <$> char ':' <*> identifierString
    ,guard (diAtIdentifier d) >>
     PrefixedVariable <$> char '@' <*> identifierString
    ,guard (diHashIdentifier d) >>
     PrefixedVariable <$> char '#' <*> identifierString
    ]
-}

 --------------------------------------

-symbol :: Dialect -> Parser Token
-symbol _ =
-    Symbol <$> choice
-    [try $ choice $ map (\a -> string a) multiCharSymbols
-    ,T.singleton <$> satisfy (`elem` singleLetterSymbol)
-    ]
-  where
-    singleLetterSymbol = "(),-+*/<>=." :: String
-    multiCharSymbols = ["!=", "<>", ">=", "<=", "||"]
-
-
 {-
 Symbols

@ -596,7 +522,7 @@ A symbol is an operator, or one of the misc symbols which include:
 The postgresql operator syntax allows a huge range of operators
 compared with ansi and other dialects
 -}
-{-
+
 symbol :: Dialect -> Parser Token
 symbol d  = Symbol <$> choice (concat
   [dots
@ -610,14 +536,14 @@ symbol d  = Symbol <$> choice (concat
    else basicAnsiOps
   ])
 where
-   dots = [many1 (char '.')]
+   dots = [takeWhile1P (Just "dot") (=='.')]
   odbcSymbol = [string "{", string "}"]
   postgresExtraSymbols =
       [try (string ":=")
        -- parse :: and : and avoid allowing ::: or more
       ,try (string "::" <* notFollowedBy (char ':'))
       ,try (string ":" <* notFollowedBy (char ':'))]
-   miscSymbol = map (string . (:[])) $
+   miscSymbol = map (string . T.singleton) $
                case () of
                    _ | diSqlServerSymbols d -> ",;():?"
                      | diPostgresSymbols d -> "[],;()"
@ -629,14 +555,14 @@ symbols can also be part of a single character symbol
 -}

   basicAnsiOps = map (try . string) [">=","<=","!=","<>"]
-                  ++ map (string . (:[])) "+-^*/%~&<>="
+                  ++ map (string . T.singleton) "+-^*/%~&<>="
                  ++ pipes
   pipes = -- what about using many1 (char '|'), then it will
           -- fail in the parser? Not sure exactly how
           -- standalone the lexer should be
           [char '|' *>
            choice ["||" <$ char '|' <* notFollowedBy (char '|')
-                   ,return "|"]]
+                   ,pure "|"]]

 {-
 postgresql generalized operators
@ -662,7 +588,7 @@ which allows the last character of a multi character symbol to be + or
 -
 -}

-generalizedPostgresqlOperator :: [Parser String]
+generalizedPostgresqlOperator :: [Parser Text]
 generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
  where
    allOpSymbols = "+-*/<>=~!@#%^&|`?"
@ -674,13 +600,13 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
    singlePlusMinus = try $ do
      c <- oneOf "+-"
      notFollowedBy $ oneOf allOpSymbols
-      return [c]
+      pure $ T.singleton c

    -- this is used when we are parsing a potentially multi symbol
    -- operator and we have alread seen one of the 'exception chars'
    -- and so we can end with a + or -
    moreOpCharsException = do
-       c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
+       c <- oneOf (filter (`notElemChar` "-/*") allOpSymbols)
            -- make sure we don't parse a comment starting token
            -- as part of an operator
            <|> try (char '/' <* notFollowedBy (char '*'))
@ -688,14 +614,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
            -- and make sure we don't parse a block comment end
            -- as part of another symbol
            <|> try (char '*' <* notFollowedBy (char '/'))
-       (c:) <$> option [] moreOpCharsException
+       T.cons c <$> option "" moreOpCharsException

    opMoreChars = choice
       [-- parse an exception char, now we can finish with a + -
-        (:)
+        T.cons
        <$> oneOf exceptionOpSymbols
-        <*> option [] moreOpCharsException
-       ,(:)
+        <*> option "" moreOpCharsException
+       ,T.cons
        <$> (-- parse +, make sure it isn't the last symbol
             try (char '+' <* lookAhead (oneOf allOpSymbols))
             <|> -- parse -, make sure it isn't the last symbol
@ -709,18 +635,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
             try (char '*' <* notFollowedBy (char '/'))
             <|> -- any other ansi operator symbol
             oneOf "<>=")
-        <*> option [] opMoreChars
+        <*> option "" opMoreChars
       ]
-}

 --------------------------------------

 sqlWhitespace :: Dialect -> Parser Token
 sqlWhitespace _ = Whitespace <$> takeWhile1P (Just "whitespace") isSpace <?> ""

--sqlWhitespace :: Dialect -> Parser Token
--sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
-
 ----------------------------------------------------------------------------

 -- parser helpers
@ -731,25 +653,11 @@ char_ = void . char
 string_ :: Text -> Parser ()
 string_ = void . string

-{-
-startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
-startsWith p ps = do
-  c <- satisfy p
-  choice [(:) c <$> (takeWhile1 ps)
-         ,return [c]]
+oneOf :: [Char] -> Parser Char
+oneOf = M.oneOf

-takeWhile1 :: (Char -> Bool) -> Parser String
-takeWhile1 p = many1 (satisfy p)
-
-takeWhile :: (Char -> Bool) -> Parser String
-takeWhile p = many (satisfy p)
-
-takeTill :: (Char -> Bool) -> Parser String
-takeTill p = manyTill anyChar (peekSatisfy p)
-
-peekSatisfy :: (Char -> Bool) -> Parser ()
-peekSatisfy p = void $ lookAhead (satisfy p)
-}
+notElemChar :: Char -> [Char] -> Bool
+notElemChar a b = a `notElem` (b :: [Char])

 ----------------------------------------------------------------------------

@ -776,8 +684,7 @@ successes. I don't think it succeeds this test at the moment
 -- will pretty print then lex back to the same set of tokens.
 -- Used internally, might be useful for generating SQL via lexical tokens.
 tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
-tokenListWillPrintAndLex = undefined
-{-tokenListWillPrintAndLex _ [] = True
+tokenListWillPrintAndLex _ [] = True
 tokenListWillPrintAndLex _ [_] = True
 tokenListWillPrintAndLex d (a:b:xs) =
    tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)
@ -791,7 +698,7 @@ followed by = or : makes a different symbol
 -}

    | Symbol ":" <- a
-    , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False
+    , checkFirstBChar (\x -> isIdentifierChar x || x `T.elem` ":=") = False

 {-
 two symbols next to eachother will fail if the symbols can combine and
@ -801,7 +708,7 @@ two symbols next to eachother will fail if the symbols can combine and
    | diPostgresSymbols d
    , Symbol a' <- a
    , Symbol b' <- b
-    , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False
+    , b' `notElem` ["+", "-"] || or (map (`T.elem` a') "~!@#%^&|`?") = False

 {-
 check two adjacent symbols in non postgres where the combination
@ -906,17 +813,13 @@ TODO: not 100% on this always being bad
    -- helper function to run a predicate on the
    -- last character of the first token and the first
    -- character of the second token
-    checkBorderChars f
-        | (_:_) <- prettya
-        , (fb:_) <- prettyb
-        , la <- last prettya
-        = f la fb
-    checkBorderChars _ = False
-    checkFirstBChar f = case prettyb of
-                          (b':_) -> f b'
-                          _ -> False
-    checkLastAChar f = case prettya of
-                          (_:_) -> f $ last prettya
-                          _ -> False
-
-}
+    checkBorderChars f =
+        case (T.unsnoc prettya, T.uncons prettyb) of
+            (Just (_,la), Just (fb,_)) -> f la fb
+            _ -> False
+    checkFirstBChar f = case T.uncons prettyb of
+        Just (b',_) -> f b'
+        _ -> False
+    checkLastAChar f = case T.unsnoc prettya of
+        Just (_,la) -> f la
+        _ -> False
--- a/tools/Language/SQL/SimpleSQL/LexerTests.hs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.hs
@ -24,10 +24,8 @@ import Language.SQL.SimpleSQL.Lex
    ,tokenListWillPrintAndLex
    )

-import Language.SQL.SimpleSQL.Dialect
-    (ansi2011)
-
 import qualified Data.Text as T
+import Data.Text (Text)
    
 --import Debug.Trace
 --import Data.Char (isAlpha)
@ -35,12 +33,13 @@ import qualified Data.Text as T

 lexerTests :: TestItem
 lexerTests = Group "lexerTests" $
-    [bootstrapTests{-Group "lexer token tests" [ansiLexerTests
-                               ,postgresLexerTests
-                               ,sqlServerLexerTests
-                               ,oracleLexerTests
-                               ,mySqlLexerTests
-                               ,odbcLexerTests]-}]
+    [bootstrapTests
+    ,ansiLexerTests
+    ,postgresLexerTests
+    ,sqlServerLexerTests
+    ,oracleLexerTests
+    ,mySqlLexerTests
+    ,odbcLexerTests]

 -- quick sanity tests to see something working
 bootstrapTests :: TestItem
@ -75,8 +74,9 @@ bootstrapTests = Group "bootstrap tests" $
    ,("1", [SqlNumber "1"])
    ,("42", [SqlNumber "42"])

-    ,("$1", [PositionalArg 1])
-    ,("$200", [PositionalArg 200])
+    -- have to fix the dialect handling in the tests
+    --,("$1", [PositionalArg 1])
+    --,("$200", [PositionalArg 200])

    ,(":test", [PrefixedVariable ':' "test"])

@ -84,22 +84,22 @@ bootstrapTests = Group "bootstrap tests" $
     ["!=", "<>", ">=", "<=", "||"]
     ++ map T.singleton ("(),-+*/<>=." :: String)))

-{-
-ansiLexerTable :: [(String,[Token])]
+
+ansiLexerTable :: [(Text,[Token])]
 ansiLexerTable =
    -- single char symbols
-    map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;()"
+    map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;()"
    -- multi char symbols
    ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||"]
    ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
        -- simple identifiers
        in map (\i -> (i, [Identifier Nothing i])) idens
-           ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens
+           <> map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens
           -- todo: in order to make lex . pretty id, need to
           -- preserve the case of the u
-           ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens
+           <> map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens
           -- host param
-           ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens
+           <> map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens
       )
    -- quoted identifiers with embedded double quotes
    -- the lexer doesn't unescape the quotes
@ -111,7 +111,7 @@ ansiLexerTable =
       ,("'normalendquote '''", [SqlString "'" "'" "normalendquote ''"])
       ,("'\n'", [SqlString "'" "'" "\n"])]
    -- csstrings
-    ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"]))
+    ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"]))
       ["n", "N","b", "B","x", "X", "u&"]
    -- numbers
    ++ [("10", [SqlNumber "10"])
@ -122,8 +122,8 @@ ansiLexerTable =
       ,("10.2", [SqlNumber "10.2"])
       ,("10.2e7", [SqlNumber "10.2e7"])]
    -- whitespace
-    ++ concat [[([a],[Whitespace [a]])
-               ,([a,b], [Whitespace [a,b]])]
+    ++ concat [[(T.singleton a,[Whitespace $ T.singleton a])
+               ,(T.singleton a <> T.singleton b, [Whitespace (T.singleton a <> T.singleton b)])]
              | a <- " \n\t", b <- " \n\t"]
    -- line comment
    ++ map (\c -> (c, [LineComment c]))
@ -134,14 +134,15 @@ ansiLexerTable =
       ,"/* this *is/ a comment */"
       ]

+
 ansiLexerTests :: TestItem
 ansiLexerTests = Group "ansiLexerTests" $
    [Group "ansi lexer token tests" $ [LexTest ansi2011 s t |  (s,t) <- ansiLexerTable]
    ,Group "ansi generated combination lexer tests" $
-    [ LexTest ansi2011 (s ++ s1) (t ++ t1)
+    [ LexTest ansi2011 (s <> s1) (t <> t1)
    | (s,t) <- ansiLexerTable
    , (s1,t1) <- ansiLexerTable
-    , tokenListWillPrintAndLex ansi2011 $ t ++ t1
+    , tokenListWillPrintAndLex ansi2011 $ t <> t1

    ]
    ,Group "ansiadhoclexertests" $
@ -185,10 +186,10 @@ assurance.



-postgresLexerTable :: [(String,[Token])]
+postgresLexerTable :: [(Text,[Token])]
 postgresLexerTable =
    -- single char symbols
-    map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():"
+    map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;():"
    -- multi char symbols
    ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="]
    -- generic symbols
@ -196,12 +197,12 @@ postgresLexerTable =
    ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
        -- simple identifiers
        in map (\i -> (i, [Identifier Nothing i])) idens
-           ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens
+           ++ map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens
           -- todo: in order to make lex . pretty id, need to
           -- preserve the case of the u
-           ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens
+           ++ map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens
           -- host param
-           ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens
+           ++ map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens
       )
    -- positional var
    ++ [("$1", [PositionalArg 1])]
@ -223,7 +224,7 @@ postgresLexerTable =
       ,("$a$ $$string 3$$ $a$", [SqlString "$a$" "$a$" " $$string 3$$ "])
       ]
    -- csstrings
-    ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"]))
+    ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"]))
       ["n", "N","b", "B","x", "X", "u&", "e", "E"]
    -- numbers
    ++ [("10", [SqlNumber "10"])
@ -234,8 +235,8 @@ postgresLexerTable =
       ,("10.2", [SqlNumber "10.2"])
       ,("10.2e7", [SqlNumber "10.2e7"])]
    -- whitespace
-    ++ concat [[([a],[Whitespace [a]])
-               ,([a,b], [Whitespace [a,b]])]
+    ++ concat [[(T.singleton a,[Whitespace $ T.singleton a])
+               ,(T.singleton a <> T.singleton b, [Whitespace $ T.singleton a <> T.singleton b])]
              | a <- " \n\t", b <- " \n\t"]
    -- line comment
    ++ map (\c -> (c, [LineComment c]))
@ -267,24 +268,24 @@ operators without one of the exception chars
 also: do the testing for the ansi compatibility special cases
 -}

-postgresShortOperatorTable :: [(String,[Token])]
+postgresShortOperatorTable :: [(Text,[Token])]
 postgresShortOperatorTable =
    [ (x, [Symbol x]) | x <- someValidPostgresOperators 2]


-postgresExtraOperatorTable :: [(String,[Token])]
+postgresExtraOperatorTable :: [(Text,[Token])]
 postgresExtraOperatorTable =
    [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]


-someValidPostgresOperators :: Int -> [String]
+someValidPostgresOperators :: Int -> [Text]
 someValidPostgresOperators l =
       [ x
       | n <- [1..l]
       , x <- combos "+-*/<>=~!@#%^&|`?" n
-       , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
-       , not (last x `elem` "+-")
-         || or (map (`elem` x) "~!@#%^&|`?")
+       , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x)
+       , not (T.last x `T.elem` "+-")
+         || or (map (`T.elem` x) "~!@#%^&|`?")
       ]

 {-
@ -293,13 +294,13 @@ These are postgres operators, which if followed immediately by a + or
 the + or -.
 -}

-somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String]
+somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [Text]
 somePostgresOpsWhichWontAddTrailingPlusMinus l =
       [ x
       | n <- [1..l]
       , x <- combos "+-*/<>=" n
-       , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
-       , not (last x `elem` "+-")
+       , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x)
+       , not (T.last x `T.elem` "+-")
       ]


@ -310,7 +311,7 @@ postgresLexerTests = Group "postgresLexerTests" $
    ,Group "postgres generated lexer token tests" $
     [LexTest postgres s t | (s,t) <- postgresShortOperatorTable ++ postgresExtraOperatorTable]
    ,Group "postgres generated combination lexer tests" $
-    [ LexTest postgres (s ++ s1) (t ++ t1)
+    [ LexTest postgres (s <> s1) (t <> t1)
    | (s,t) <- postgresLexerTable ++ postgresShortOperatorTable
    , (s1,t1) <- postgresLexerTable ++ postgresShortOperatorTable
    , tokenListWillPrintAndLex postgres $ t ++ t1
@ -344,18 +345,18 @@ postgresLexerTests = Group "postgresLexerTests" $
    ]
 where
   edgeCaseCommentOps =
-     [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
+     [ (x <> "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
     | x <- eccops
-     , not (last x == '*')
+     , not (T.last x == '*')
     ] ++
-     [ (x ++ "--<test", [Symbol x, LineComment "--<test"])
+     [ (x <> "--<test", [Symbol x, LineComment "--<test"])
     | x <- eccops
-     , not (last x == '-')
+     , not (T.last x == '-')
     ]
   eccops = someValidPostgresOperators 2
   edgeCasePlusMinusOps = concat
-     [ [ (x ++ "+", [Symbol x, Symbol "+"])
-       , (x ++ "-", [Symbol x, Symbol "-"]) ]
+     [ [ (x <> "+", [Symbol x, Symbol "+"])
+       , (x <> "-", [Symbol x, Symbol "-"]) ]
     | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2
     ]
   edgeCasePlusMinusComments =
@ -365,7 +366,6 @@ postgresLexerTests = Group "postgresLexerTests" $
     ,("+/**/", [Symbol "+", BlockComment "/**/"])
     ]

-
 sqlServerLexerTests :: TestItem
 sqlServerLexerTests = Group "sqlServerLexTests" $
    [ LexTest sqlserver s t | (s,t) <-
@ -393,8 +393,6 @@ odbcLexerTests = Group "odbcLexTests" $
    ++ [LexFails sqlserver {diOdbc = False} "{"
       ,LexFails sqlserver {diOdbc = False} "}"]

-combos :: [a] -> Int -> [[a]]
-combos _ 0 = [[]]
-combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]
-
-}
+combos :: [Char] -> Int -> [Text]
+combos _ 0 = [T.empty]
+combos l n = [ T.cons x tl | x <- l, tl <- combos l (n - 1) ]
--- a/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs
+++ b/tools/Language/SQL/SimpleSQL/SQL2011Queries.hs
@ -36,7 +36,6 @@ module Language.SQL.SimpleSQL.SQL2011Queries (sql2011QueryTests) where
 import Language.SQL.SimpleSQL.TestTypes
 import Language.SQL.SimpleSQL.Syntax

-import qualified Data.Text as T
 import Data.Text (Text)

 sql2011QueryTests :: TestItem
--- a/tools/Language/SQL/SimpleSQL/ScalarExprs.hs
+++ b/tools/Language/SQL/SimpleSQL/ScalarExprs.hs
@ -7,8 +7,6 @@ module Language.SQL.SimpleSQL.ScalarExprs (scalarExprTests) where
 import Language.SQL.SimpleSQL.TestTypes
 import Language.SQL.SimpleSQL.Syntax

-import qualified Data.Text as T
-
 scalarExprTests :: TestItem
 scalarExprTests = Group "scalarExprTests"
    [literals
--- a/tools/Language/SQL/SimpleSQL/Tests.hs
+++ b/tools/Language/SQL/SimpleSQL/Tests.hs
@ -58,7 +58,7 @@ testData :: TestItem
 testData =
    Group "parserTest"
    [lexerTests
-    {-,scalarExprTests
+    ,scalarExprTests
    ,odbcTests
    ,queryExprComponentTests
    ,queryExprsTests
@ -76,7 +76,7 @@ testData =
    ,oracleTests
    ,customDialectTests
    ,emptyStatementTests
-    ,createIndexTests-}
+    ,createIndexTests
    ]

 tests :: T.TestTree