switch in megaparsec with stub lexing code

2024-01-09 17:53:12 +00:00 · 2024-01-09 17:53:12 +00:00 · 9396aa8cba
commit 9396aa8cba
parent d80796b1dd
6 changed files with 345 additions and 129 deletions
--- a/Language/SQL/SimpleSQL/Lex.hs
+++ b/Language/SQL/SimpleSQL/Lex.hs
@ -52,38 +52,70 @@ start writing the error message tests:
    still be manual)
 try again to add annotation to the ast
 -}
 -- | Lexer for SQL.
 {-# LANGUAGE TupleSections #-}
 {-# LANGUAGE OverloadedStrings #-}
 module Language.SQL.SimpleSQL.Lex
    (Token(..)
    ,WithPos(..)
    ,lexSQL
    ,prettyToken
    ,prettyTokens
    ,ParseError(..)
    ,prettyError
    ,tokenListWillPrintAndLex
    ,ansi2011
    ) where
 import Language.SQL.SimpleSQL.Dialect
    (Dialect(..)
    ,ansi2011
    )
 import Text.Megaparsec
    (Parsec
    ,runParser'
    ,ParseErrorBundle(..)
    ,errorBundlePretty
    ,SourcePos(..)
    ,getSourcePos
    ,getOffset
    ,pstateSourcePos
    ,statePosState
    ,mkPos
    ,choice
    ,satisfy
    ,takeWhileP
    ,takeWhile1P
    ,(<?>)
    ,eof
    ,many
    ,try
    ,option
    )
 import Text.Megaparsec.Char
    (string
    ,char
    )
 import Text.Megaparsec.State (initialState)
 import Data.Void (Void)
 import Text.Parsec (option,string,manyTill,anyChar
                   ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
                   ,many,runParser,lookAhead,satisfy
                   ,setPosition,getPosition
                   ,setSourceColumn,setSourceLine
                   ,sourceName, setSourceName
                   ,sourceLine, sourceColumn
                   ,notFollowedBy)
 import Language.SQL.SimpleSQL.Combinators
 import Language.SQL.SimpleSQL.Errors
 import Control.Applicative hiding ((<|>), many)
 import Data.Char
-import Control.Monad
+    (isAlphaNum
-import Prelude hiding (takeWhile)
+    ,isAlpha
-import Text.Parsec.String (Parser)
+    ,isSpace
-import Data.Maybe
+    ,isDigit
    )
 import Control.Monad (void)
 import Data.Text (Text)
 import qualified Data.Text as T
 ------------------------------------------------------------------------------
@ -96,33 +128,33 @@ data Token
    -- * multi char symbols <> \<= \>= != ||
    -- * single char symbols: * + -  < >  ^ / %  ~ & | ? ( ) [ ] , ; ( )
    --
-    = Symbol String
+    = Symbol Text
    -- | This is an identifier or keyword. The first field is
    -- the quotes used, or nothing if no quotes were used. The quotes
    -- can be " or u& or something dialect specific like []
-    | Identifier (Maybe (String,String)) String
+    | Identifier (Maybe (Text,Text)) Text
    -- | This is a prefixed variable symbol, such as :var, @var or #var
    -- (only :var is used in ansi dialect)
-    | PrefixedVariable Char String
+    | PrefixedVariable Char Text
    -- | This is a positional arg identifier e.g. $1
    | PositionalArg Int
    -- | This is a string literal. The first two fields are the --
    -- start and end quotes, which are usually both ', but can be
    -- the character set (one of nNbBxX, or u&, U&), or a dialect
    -- specific string quoting (such as $$ in postgres)
-    | SqlString String String String
+    | SqlString Text Text Text
    -- | A number literal (integral or otherwise), stored in original format
    -- unchanged
-    | SqlNumber String
+    | SqlNumber Text
    -- | Whitespace, one or more of space, tab or newline.
-    | Whitespace String
+    | Whitespace Text
    -- | A commented line using --, contains every character starting with the
    -- \'--\' and including the terminating newline character if there is one
    -- - this will be missing if the last line in the source is a line comment
    -- with no trailing newline
-    | LineComment String
+    | LineComment Text
    -- | A block comment, \/* stuff *\/, includes the comment delimiters
-    | BlockComment String
+    | BlockComment Text
      deriving (Eq,Show)
 ------------------------------------------------------------------------------
@ -131,79 +163,98 @@ data Token
 -- | Lex some SQL to a list of tokens.
 lexSQL :: Dialect
-                  -- ^ dialect of SQL to use
+       -- ^ dialect of SQL to use
-               -> FilePath
+       -> Text
-                  -- ^ filename to use in error messages
+       -- ^ filename to use in error messages
-               -> Maybe (Int,Int)
+       -> Maybe (Int,Int)
-                  -- ^ line number and column number of the first character
+       -- ^ line number and column number of the first character
-                  -- in the source to use in error messages
+       -- in the source to use in error messages
-               -> String
+       -> Text
-                  -- ^ the SQL source to lex
+       -- ^ the SQL source to lex
-               -> Either ParseError [((String,Int,Int),Token)]
+       -> Either ParseError [WithPos Token]
-lexSQL dialect fn' p src =
+lexSQL dialect fn p src = myParse fn p (many (sqlToken dialect) <* (eof <?> "")) src
-    let (l',c') = fromMaybe (1,1) p
+
-    in either (Left . convParseError src) Right
+myParse :: Text -> (Maybe (Int,Int)) -> Parser a -> Text -> Either ParseError a
-       $ runParser (setPos (fn',l',c') *> many (sqlToken dialect) <* eof) () fn' src
+myParse name sp' p s =
-  where
+        let sp = maybe (1,1) id sp'
-    setPos (fn,l,c) = do
+            ps = SourcePos (T.unpack name) (mkPos $ fst sp) (mkPos $ snd sp)
-        fmap (flip setSourceName fn
+            is = (initialState (T.unpack name) s)
-               . flip setSourceLine l
+            sps = (statePosState is) {pstateSourcePos = ps}
-               . flip setSourceColumn c) getPosition
+            is' = is {statePosState = sps}
-          >>= setPosition
+        in snd $ runParser' p is'
 prettyError :: ParseError -> Text
 prettyError = T.pack . errorBundlePretty
 ------------------------------------------------------------------------------
-- pretty printing
+-- parsing boilerplate
 type ParseError = ParseErrorBundle Text Void
 type Parser = Parsec Void Text
 -- | Positional information added to tokens to preserve source positions
 -- for the parser
 data WithPos a = WithPos
  { startPos :: SourcePos
  , endPos :: SourcePos
  , tokenLength :: Int
  , tokenVal :: a
  } deriving (Eq, Ord, Show)
 ------------------------------------------------------------------------------
 -- pretty print
 -- | Pretty printing, if you lex a bunch of tokens, then pretty
 -- print them, should should get back exactly the same string
-prettyToken :: Dialect -> Token -> String
+prettyToken :: Dialect -> Token -> Text
 prettyToken _ (Symbol s) = s
 prettyToken _ (Identifier Nothing t) = t
-prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
+prettyToken _ (Identifier (Just (q1,q2)) t) = q1 <> t <> q2
-prettyToken _ (PrefixedVariable c p) = c:p
+prettyToken _ (PrefixedVariable c p) = T.cons c p
-prettyToken _ (PositionalArg p) = '$':show p
+prettyToken _ (PositionalArg p) = T.cons '$' $ T.pack $ show p
-prettyToken _ (SqlString s e t) = s ++ t ++ e
+prettyToken _ (SqlString s e t) = s <> t <> e
 prettyToken _ (SqlNumber r) = r
 prettyToken _ (Whitespace t) = t
 prettyToken _ (LineComment l) = l
 prettyToken _ (BlockComment c) = c
-prettyTokens :: Dialect -> [Token] -> String
+prettyTokens :: Dialect -> [Token] -> Text
-prettyTokens d ts = concat $ map (prettyToken d) ts
+prettyTokens d ts = T.concat $ map (prettyToken d) ts
 ------------------------------------------------------------------------------
 -- token parsers
 -- | parser for a sql token
-sqlToken :: Dialect -> Parser ((String,Int,Int),Token)
+sqlToken :: Dialect -> Parser (WithPos Token)
 sqlToken d = do
-    p' <- getPosition
+    -- possibly there's a more efficient way of doing the source positions?
-    let p = (sourceName p',sourceLine p', sourceColumn p')
+    sp <- getSourcePos
-
+    off <- getOffset
-{-
+    t <- choice
-The order of parsers is important: strings and quoted identifiers can
+         [sqlString d
-start out looking like normal identifiers, so we try to parse these
+         ,identifier d
-first and use a little bit of try. Line and block comments start like
+         ,lineComment d
-symbols, so we try these before symbol. Numbers can start with a . so
+         ,blockComment d
-this is also tried before symbol (a .1 will be parsed as a number, but
+         ,sqlNumber d
-. otherwise will be parsed as a symbol).
+         ,positionalArg d
-}
+         --,dontParseEndBlockComment d
-
+         ,prefixedVariable d
-    (p,) <$> choice [sqlString d
+         ,symbol d
-                    ,identifier d
+         ,sqlWhitespace d]
-                    ,lineComment d
+    off1 <- getOffset
-                    ,blockComment d
+    ep <- getSourcePos
-                    ,sqlNumber d
+    pure $ WithPos sp ep (off1 - off) t
                    ,positionalArg d
                    ,dontParseEndBlockComment d
                    ,prefixedVariable d
                    ,symbol d
                    ,sqlWhitespace d]
 --------------------------------------
 sqlString :: Dialect -> Parser Token
 sqlString d =
    SqlString "'" "'" <$> (char_ '\'' *> takeWhileP (Just "non quote char") (/= '\'') <* char_ '\'')
 {-
 Parse a SQL string. Examples:
@ -214,7 +265,7 @@ b'binary string'
 x'hexidecimal string'
 -}
-
+{-
 sqlString :: Dialect -> Parser Token
 sqlString d = dollarString <|> csString <|> normalString
  where
@ -259,10 +310,27 @@ sqlString d = dollarString <|> csString <|> normalString
    cs = choice $ (map (\x -> string ([x] ++ "'")) csPrefixes)
                  ++ [string "u&'"
                     ,string "U&'"]
-
+-}
 --------------------------------------
 -- TODO: this reconstitutes the string from bits, instead of lifting
 -- it in one piece from the source. This is a performance issue, not
 -- sure if it will be significant. The same comment applies to most of
 -- the other parsers
 identifier :: Dialect -> Parser Token
 identifier d = Identifier Nothing <$> identifierString d
 identifierString :: Dialect -> Parser Text
 identifierString _ = (do
    c <- satisfy isFirstLetter
    choice
        [T.cons c <$> (takeWhileP (Just "identifier char") isNonFirstLetter)
        ,pure $ T.singleton c]) <?> "identifier"
  where
     isFirstLetter c = c == '_' || isAlpha c
     isNonFirstLetter c = c == '_' || isAlphaNum c
 {-
 Parses identifiers:
@ -273,6 +341,7 @@ u&"unicode quoted identifier"
 `mysql quoted identifier`
 -}
 {-
 identifier :: Dialect -> Parser Token
 identifier d =
    choice
@ -317,9 +386,28 @@ identifierString =
 isIdentifierChar :: Char -> Bool
 isIdentifierChar c = c == '_' || isAlphaNum c
 -}
 --------------------------------------
 {-
 I think it's always faster to use a string locally created in the parser code,
 than to use one taken from the parsed source, unless you take it without modifying it,
 the example here is using -- and \n. this won't be needed in this case if can work out
 how to lift the entire comment as a single string from the source.
 this concept does apply to things like symbols
 -}
 lineComment :: Dialect -> Parser Token
 lineComment _ = do
    try (string_ "--") <?> ""
    rest <- takeWhileP (Just "non newline character") (/='\n')
    -- can you optionally read the \n to terminate the takewhilep without reparsing it?
    suf <- option "" ("\n" <$ char_ '\n')
    pure $ LineComment $ T.concat ["--", rest, suf]
 {-lineComment :: Dialect -> Parser Token
 lineComment _ =
    (\s -> LineComment $ concat ["--",s]) <$>
    -- try is used here in case we see a - symbol
@ -332,16 +420,28 @@ lineComment _ =
    conc a (Just b) = a ++ b
    lineCommentEnd =
        Just "\n" <$ char '\n'
-        <|> Nothing <$ eof
+        <|> Nothing <$ eof-}
 --------------------------------------
 blockComment :: Dialect -> Parser Token
 blockComment _ = (do
    try $ string_ "/*"
    BlockComment . T.concat . ("/*":) <$> more) <?> ""
  where
    more = choice
        [["*/"] <$ try (string_ "*/")
        ,char_ '*' *> (("*":) <$> more)
        ,(:) <$> takeWhile1P (Just "non comment terminator text") (/= '*') <*> more]
 {-
 Try is used in the block comment for the two symbol bits because we
 want to backtrack if we read the first symbol but the second symbol
 isn't there.
 -}
-
+{-
 blockComment :: Dialect -> Parser Token
 blockComment _ =
    (\s -> BlockComment $ concat ["/*",s]) <$>
@ -361,7 +461,7 @@ blockComment _ =
             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
              -- not an end comment or nested comment, continue
             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
-
+-}
 {-
 This is to improve user experience: provide an error if we see */
@ -370,14 +470,22 @@ in them (which is a stupid thing to do). In other cases, the user
 should write * / instead (I can't think of any cases when this would
 be valid syntax though).
 -}
-
+{-
 dontParseEndBlockComment :: Dialect -> Parser Token
 dontParseEndBlockComment _ =
    -- don't use try, then it should commit to the error
    try (string "*/") *> fail "comment end without comment start"
-
+-}
 --------------------------------------
 sqlNumber :: Dialect -> Parser Token
 sqlNumber _ =
    SqlNumber <$> digits
 digits :: Parser Text
 digits = takeWhile1P (Just "digit") isDigit
 {-
 numbers
@ -396,6 +504,7 @@ considered part of the constant; it is an operator applied to the
 constant.
 -}
 {-
 sqlNumber :: Dialect -> Parser Token
 sqlNumber d =
    SqlNumber <$> completeNumber
@ -427,20 +536,34 @@ sqlNumber d =
    expon = (:) <$> oneOf "eE" <*> sInt
    sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
    pp = (<$$> (++))
 -}
 --------------------------------------
 positionalArg :: Dialect -> Parser Token
 positionalArg _ = PositionalArg <$> (char_ '$' *> (read . T.unpack <$> digits))
 {-
 positionalArg :: Dialect -> Parser Token
 positionalArg d =
  guard (diPositionalArg d) >>
  -- use try to avoid ambiguities with other syntax which starts with dollar
  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
-
+-}
 --------------------------------------
 -- todo: I think the try here should read a prefix char, then a single valid
 -- identifier char, then commit
 prefixedVariable :: Dialect -> Parser Token
 prefixedVariable d = try $ choice
    [PrefixedVariable <$> (':' <$ char_ ':') <*> identifierString d
    ]
 -- use try because : and @ can be part of other things also
 {-
 prefixedVariable :: Dialect -> Parser Token
 prefixedVariable  d = try $ choice
    [PrefixedVariable <$> char ':' <*> identifierString
@ -449,9 +572,21 @@ prefixedVariable  d = try $ choice
    ,guard (diHashIdentifier d) >>
     PrefixedVariable <$> char '#' <*> identifierString
    ]
 -}
 --------------------------------------
 symbol :: Dialect -> Parser Token
 symbol _ =
    Symbol <$> choice
    [try $ choice $ map (\a -> string a) multiCharSymbols
    ,T.singleton <$> satisfy (`elem` singleLetterSymbol)
    ]
  where
    singleLetterSymbol = "(),-+*/<>=." :: String
    multiCharSymbols = ["!=", "<>", ">=", "<=", "||"]
 {-
 Symbols
@ -461,7 +596,7 @@ A symbol is an operator, or one of the misc symbols which include:
 The postgresql operator syntax allows a huge range of operators
 compared with ansi and other dialects
 -}
-
+{-
 symbol :: Dialect -> Parser Token
 symbol d  = Symbol <$> choice (concat
   [dots
@ -576,16 +711,27 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
             oneOf "<>=")
        <*> option [] opMoreChars
       ]
 -}
 --------------------------------------
 sqlWhitespace :: Dialect -> Parser Token
-sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
+sqlWhitespace _ = Whitespace <$> takeWhile1P (Just "whitespace") isSpace <?> ""
 --sqlWhitespace :: Dialect -> Parser Token
 --sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
 ----------------------------------------------------------------------------
 -- parser helpers
 char_ :: Char -> Parser ()
 char_ = void . char
 string_ :: Text -> Parser ()
 string_ = void . string
 {-
 startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
 startsWith p ps = do
  c <- satisfy p
@ -603,6 +749,7 @@ takeTill p = manyTill anyChar (peekSatisfy p)
 peekSatisfy :: (Char -> Bool) -> Parser ()
 peekSatisfy p = void $ lookAhead (satisfy p)
 -}
 ----------------------------------------------------------------------------
@ -629,7 +776,8 @@ successes. I don't think it succeeds this test at the moment
 -- will pretty print then lex back to the same set of tokens.
 -- Used internally, might be useful for generating SQL via lexical tokens.
 tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
-tokenListWillPrintAndLex _ [] = True
+tokenListWillPrintAndLex = undefined
 {-tokenListWillPrintAndLex _ [] = True
 tokenListWillPrintAndLex _ [_] = True
 tokenListWillPrintAndLex d (a:b:xs) =
    tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)
@ -770,3 +918,5 @@ TODO: not 100% on this always being bad
    checkLastAChar f = case prettya of
                          (_:_) -> f $ last prettya
                          _ -> False
 -}
--- a/Language/SQL/SimpleSQL/Parse.hs
+++ b/Language/SQL/SimpleSQL/Parse.hs
@ -281,7 +281,7 @@ wrapParse :: Parser a
          -> Maybe (Int,Int)
          -> String
          -> Either ParseError a
-wrapParse parser d f p src = do
+wrapParse parser d f p src = undefined {-do
    let (l,c) = fromMaybe (1,1) p
    lx <- L.lexSQL d f (Just (l,c)) src
    either (Left . convParseError src) Right
@ -294,7 +294,7 @@ wrapParse parser d f p src = do
    keep (_,L.Whitespace {}) = False
    keep (_,L.LineComment {}) = False
    keep (_,L.BlockComment {}) = False
-    keep _ = True
+    keep _ = True-}
 {-
@ -2084,16 +2084,16 @@ keyword matching
 -}
 stringTok :: Parser (String,String,String)
-stringTok = mytoken (\tok ->
+stringTok = undefined {-mytoken (\tok ->
    case tok of
      L.SqlString s e t -> Just (s,e,t)
-      _ -> Nothing)
+      _ -> Nothing)-}
 singleQuotesOnlyStringTok :: Parser String
-singleQuotesOnlyStringTok = mytoken (\tok ->
+singleQuotesOnlyStringTok = undefined {-mytoken (\tok ->
    case tok of
      L.SqlString "'" "'" t -> Just t
-      _ -> Nothing)
+      _ -> Nothing)-}
 {-
 This is to support SQL strings where you can write
@ -2104,7 +2104,7 @@ It is only allowed when all the strings are quoted with ' atm.
 -}
 stringTokExtend :: Parser (String,String,String)
-stringTokExtend = do
+stringTokExtend = undefined {-do
    (s,e,x) <- stringTok
    choice [
         do
@ -2113,48 +2113,48 @@ stringTokExtend = do
         guard (s' == "'" && e' == "'")
         return $ (s,e,x ++ y)
        ,return (s,e,x)
-        ]
+        ]-}
 hostParamTok :: Parser String
-hostParamTok = mytoken (\tok ->
+hostParamTok = undefined {-mytoken (\tok ->
    case tok of
      L.PrefixedVariable c p -> Just (c:p)
-      _ -> Nothing)
+      _ -> Nothing)-}
 positionalArgTok :: Parser Int
-positionalArgTok = mytoken (\tok ->
+positionalArgTok = undefined {-mytoken (\tok ->
    case tok of
      L.PositionalArg p -> Just p
-      _ -> Nothing)
+      _ -> Nothing)-}
 sqlNumberTok :: Bool -> Parser String
-sqlNumberTok intOnly = mytoken (\tok ->
+sqlNumberTok intOnly = undefined {-mytoken (\tok ->
    case tok of
      L.SqlNumber p | not intOnly || all isDigit p -> Just p
-      _ -> Nothing)
+      _ -> Nothing)-}
 symbolTok :: Maybe String -> Parser String
-symbolTok sym = mytoken (\tok ->
+symbolTok sym = undefined {-mytoken (\tok ->
    case (sym,tok) of
      (Nothing, L.Symbol p) -> Just p
      (Just s, L.Symbol p) | s == p -> Just p
-      _ -> Nothing)
+      _ -> Nothing)-}
 identifierTok :: [String] -> Parser (Maybe (String,String), String)
-identifierTok blackList = mytoken (\tok ->
+identifierTok blackList = undefined {-mytoken (\tok ->
    case tok of
      L.Identifier q@(Just {}) p -> Just (q,p)
      L.Identifier q p | map toLower p `notElem` blackList -> Just (q,p)
-      _ -> Nothing)
+      _ -> Nothing)-}
 unquotedIdentifierTok :: [String] -> Maybe String -> Parser String
-unquotedIdentifierTok blackList kw = mytoken (\tok ->
+unquotedIdentifierTok blackList kw = undefined {-mytoken (\tok ->
    case (kw,tok) of
      (Nothing, L.Identifier Nothing p) | map toLower p `notElem` blackList -> Just p
      (Just k, L.Identifier Nothing p) | k == map toLower p -> Just p
-      _ -> Nothing)
+      _ -> Nothing)-}
 mytoken :: (L.Token -> Maybe a) -> Parser a
 mytoken test = token showToken posToken testToken
--- a/simple-sql-parser.cabal
+++ b/simple-sql-parser.cabal
@ -39,7 +39,9 @@ Flag fixitytest
 common shared-properties
  default-language:    Haskell2010
  build-depends:       base >=4 && <5,
-                       parsec >=3.1 && <3.2,
+                       megaparsec >=9.6 && <9.7, 
                       parser-combinators >= 1.3 && < 1.4,
                       parsec,
                       mtl >=2.1 && <2.4,
                       prettyprinter >= 1.7 && < 1.8,
                       text  >= 2.1 && < 2.2
--- a/tools/Language/SQL/SimpleSQL/LexerTests.hs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.hs
@ -2,23 +2,89 @@
 -- Test for the lexer
 {-
 TODO:
 figure out a way to do quickcheck testing:
 1. generate valid tokens and check they parse
 2. combine two generated tokens together for the combo testing
 this especially will work much better for the postgresql extensible
 operator tests which doing exhaustively takes ages and doesn't bring
 much benefit over testing a few using quickcheck.
 -}
 {-# LANGUAGE OverloadedStrings #-}
 module Language.SQL.SimpleSQL.LexerTests (lexerTests) where
 import Language.SQL.SimpleSQL.TestTypes
-import Language.SQL.SimpleSQL.Lex (Token(..),tokenListWillPrintAndLex)
+import Language.SQL.SimpleSQL.Lex
    (Token(..)
    ,tokenListWillPrintAndLex
    )
 import Language.SQL.SimpleSQL.Dialect
    (ansi2011)
 import qualified Data.Text as T
 --import Debug.Trace
 --import Data.Char (isAlpha)
-import Data.List
+-- import Data.List
 lexerTests :: TestItem
 lexerTests = Group "lexerTests" $
-    [Group "lexer token tests" [ansiLexerTests
+    [bootstrapTests{-Group "lexer token tests" [ansiLexerTests
                               ,postgresLexerTests
                               ,sqlServerLexerTests
                               ,oracleLexerTests
                               ,mySqlLexerTests
-                               ,odbcLexerTests]]
+                               ,odbcLexerTests]-}]
 -- quick sanity tests to see something working
 bootstrapTests :: TestItem
 bootstrapTests = Group "bootstrap tests" $
    map (uncurry (LexTest ansi2011)) (
    [("iden", [Identifier Nothing "iden"])
    ,("'string'", [SqlString "'" "'" "string"])
    ,("  ", [Whitespace "  "])
    ,("\t  ", [Whitespace "\t  "])
    ,("  \n  ", [Whitespace "  \n  "])
    ,("--", [LineComment "--"])
    ,("--\n", [LineComment "--\n"])
    ,("--stuff", [LineComment "--stuff"])
    ,("-- stuff", [LineComment "-- stuff"])
    ,("-- stuff\n", [LineComment "-- stuff\n"])
    ,("--\nstuff", [LineComment "--\n", Identifier Nothing "stuff"])
    ,("-- com \nstuff", [LineComment "-- com \n", Identifier Nothing "stuff"])
    ,("/*test1*/", [BlockComment "/*test1*/"])
    ,("/**/", [BlockComment "/**/"])
    ,("/***/", [BlockComment "/***/"])
    ,("/* * */", [BlockComment "/* * */"])
    ,("/*test*/", [BlockComment "/*test*/"])
    ,("/*te/*st*/", [BlockComment "/*te/*st*/"])
    ,("/*te*st*/", [BlockComment "/*te*st*/"])
    ,("/*lines\nmore lines*/", [BlockComment "/*lines\nmore lines*/"])
    ,("/*test1*/\n", [BlockComment "/*test1*/", Whitespace "\n"])
    ,("/*test1*/stuff", [BlockComment "/*test1*/", Identifier Nothing "stuff"])
    ,("1", [SqlNumber "1"])
    ,("42", [SqlNumber "42"])
    ,("$1", [PositionalArg 1])
    ,("$200", [PositionalArg 200])
    ,(":test", [PrefixedVariable ':' "test"])
    ] ++ map (\a -> (a, [Symbol a])) (
     ["!=", "<>", ">=", "<=", "||"]
     ++ map T.singleton ("(),-+*/<>=." :: String)))
 {-
 ansiLexerTable :: [(String,[Token])]
 ansiLexerTable =
    -- single char symbols
@ -331,13 +397,4 @@ combos :: [a] -> Int -> [[a]]
 combos _ 0 = [[]]
 combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]
 {-
 figure out a way to do quickcheck testing:
 1. generate valid tokens and check they parse
 2. combine two generated tokens together for the combo testing
 this especially will work much better for the postgresql extensible
 operator tests which doing exhaustively takes ages and doesn't bring
 much benefit over testing a few using quickcheck.
 -}
--- a/tools/Language/SQL/SimpleSQL/TestTypes.hs
+++ b/tools/Language/SQL/SimpleSQL/TestTypes.hs
@ -13,6 +13,8 @@ import Language.SQL.SimpleSQL.Syntax
 import Language.SQL.SimpleSQL.Lex (Token)
 import Language.SQL.SimpleSQL.Dialect
 import Data.Text (Text)
 {-
 TODO: maybe make the dialect args into [dialect], then each test
 checks all the dialects mentioned work, and all the dialects not
@ -38,6 +40,6 @@ should all be TODO to convert to a testqueryexpr test.
              | ParseQueryExprFails Dialect String
              | ParseScalarExprFails Dialect String
-              | LexTest Dialect String [Token]
+              | LexTest Dialect Text [Token]
              | LexFails Dialect String
                deriving (Eq,Show)
--- a/tools/Language/SQL/SimpleSQL/Tests.hs
+++ b/tools/Language/SQL/SimpleSQL/Tests.hs
@ -5,6 +5,7 @@ Test.Framework tests. It also contains the code which converts the
 test data to the Test.Framework tests.
 -}
 {-# LANGUAGE OverloadedStrings #-}
 module Language.SQL.SimpleSQL.Tests
    (testData
    ,tests
@ -17,7 +18,7 @@ import qualified Test.Tasty.HUnit as H
 --import Language.SQL.SimpleSQL.Syntax
 import Language.SQL.SimpleSQL.Pretty
 import Language.SQL.SimpleSQL.Parse
-import Language.SQL.SimpleSQL.Lex
+import qualified Language.SQL.SimpleSQL.Lex as Lex
 import Language.SQL.SimpleSQL.TestTypes
@ -44,6 +45,9 @@ import Language.SQL.SimpleSQL.MySQL
 import Language.SQL.SimpleSQL.Oracle
 import Language.SQL.SimpleSQL.CustomDialect
 import Data.Text (Text)
 import qualified Data.Text as T
 {-
 Order the tests to start from the simplest first. This is also the
@ -54,7 +58,7 @@ testData :: TestItem
 testData =
    Group "parserTest"
    [lexerTests
-    ,scalarExprTests
+    {-,scalarExprTests
    ,odbcTests
    ,queryExprComponentTests
    ,queryExprsTests
@ -72,7 +76,7 @@ testData =
    ,oracleTests
    ,customDialectTests
    ,emptyStatementTests
-    ,createIndexTests
+    ,createIndexTests-}
    ]
 tests :: T.TestTree
@ -104,18 +108,19 @@ itemToTest (ParseScalarExprFails d str) =
 itemToTest (LexTest d s ts) = makeLexerTest d s ts
 itemToTest (LexFails d s) = makeLexingFailsTest d s
-makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree
+makeLexerTest :: Dialect -> Text -> [Lex.Token] -> T.TestTree
-makeLexerTest d s ts = H.testCase s $ do
+makeLexerTest d s ts = H.testCase (T.unpack s) $ do
-    let lx = either (error . show) id $ lexSQL d "" Nothing s
+    let lx = either (error . T.unpack . Lex.prettyError) id $ Lex.lexSQL d "" Nothing s
-    H.assertEqual "" ts $ map snd lx
+        ts1 = map Lex.tokenVal lx
-    let s' = prettyTokens d $ map snd lx
+    H.assertEqual "" ts ts1
    let s' = Lex.prettyTokens d $ ts1
    H.assertEqual "pretty print" s s'
 makeLexingFailsTest :: Dialect -> String -> T.TestTree
 makeLexingFailsTest d s = H.testCase s $ do
-    case lexSQL d "" Nothing s of
+    undefined {-case lexSQL d "" Nothing s of
         Right x -> H.assertFailure $ "lexing should have failed: " ++ s ++ "\ngot: " ++ show x
-         Left _ -> return ()
+         Left _ -> return ()-}
 toTest :: (Eq a, Show a) =>