add separate lexer

2015-08-01 00:04:18 +03:00 · 2015-08-01 00:04:18 +03:00 · 2df76e3095
commit 2df76e3095
parent 1364c58534
7 changed files with 639 additions and 182 deletions
--- a/Language/SQL/SimpleSQL/Combinators.lhs
+++ b/Language/SQL/SimpleSQL/Combinators.lhs
@ -17,9 +17,8 @@

 > import Control.Applicative ((<$>), (<*>), (<**>), pure, Applicative)
 > import Text.Parsec (option,many)
-> import Text.Parsec.Prim (Parsec)
+> import Text.Parsec.String (GenParser)

-> type Parser s = Parsec String s

 a possible issue with the option suffix is that it enforces left
 associativity when chaining it recursively. Have to review
@ -29,7 +28,7 @@ instead, and create an alternative suffix parser
 This function style is not good, and should be replaced with chain and
 <??> which has a different type

-> optionSuffix :: (a -> Parser s a) -> a -> Parser s a
+> optionSuffix :: (a -> GenParser t s a) -> a -> GenParser t s a
 > optionSuffix p a = option a (p a)


@ -39,7 +38,7 @@ hand result, taken from uu-parsinglib
 TODO: make sure the precedence higher than <|> and lower than the
 other operators so it can be used nicely

-> (<??>) :: Parser s a -> Parser s (a -> a) -> Parser s a
+> (<??>) :: GenParser t s a -> GenParser t s (a -> a) -> GenParser t s a
 > p <??> q = p <**> option id q


@ -80,7 +79,7 @@ composing suffix parsers, not sure about the name. This is used to add
 a second or more suffix parser contingent on the first suffix parser
 succeeding.

-> (<??.>) :: Parser s (a -> a) -> Parser s (a -> a) -> Parser s (a -> a)
+> (<??.>) :: GenParser t s (a -> a) -> GenParser t s (a -> a) -> GenParser t s (a -> a)
 > (<??.>) pa pb = (.) `c` pa <*> option id pb
 >   -- todo: fix this mess
 >   where c = (<$>) . flip
@ -88,7 +87,7 @@ succeeding.

 0 to many repeated applications of suffix parser

-> (<??*>) :: Parser s a -> Parser s (a -> a) -> Parser s a
+> (<??*>) :: GenParser t s a -> GenParser t s (a -> a) -> GenParser t s a
 > p <??*> q = foldr ($) <$> p <*> (reverse <$> many q)


--- a/Language/SQL/SimpleSQL/Lexer.lhs
+++ b/Language/SQL/SimpleSQL/Lexer.lhs
@ -0,0 +1,296 @@
+
+Lexer TODO:
+
+left factor to get rid of trys
+
+> -- | This is the module contains a Lexer for SQL.
+> {-# LANGUAGE TupleSections #-}
+> module Language.SQL.SimpleSQL.Lexer
+>     (lexSQL
+>     ,Token(..)
+>     ,prettyToken
+>     ,prettyTokens
+>     ,Position
+>     ,ParseError(..)
+>     ,Dialect(..)) where
+
+> import Language.SQL.SimpleSQL.Syntax (Dialect(..))
+
+> import Text.Parsec (option,string,manyTill,anyChar
+>                    ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
+>                    ,many,runParser,lookAhead,satisfy)
+> import Language.SQL.SimpleSQL.Combinators
+> import Language.SQL.SimpleSQL.Errors
+> import Control.Applicative hiding ((<|>), many)
+> import Data.Char
+> import Control.Monad
+> import Prelude hiding (takeWhile)
+> import Text.Parsec.String (Parser)
+
+
+> -- | Represents a lexed token
+> data Token
+>     -- | a symbol is one of the following
+>     -- * multi char symbols <> <= >= != ||
+>     -- * single char symbols: * + -  < >  ^ / %  ~ & | ? ( ) [ ] , ; ( )
+>     --
+>     = Symbol String
+>
+>     -- | This is an identifier or keyword.
+>     --
+>     | Identifier String
+>
+>     -- | This is an identifier quoted with "
+>     | QIdentifier String
+>     -- | This is an identifier quoted with u&"
+>     | UQIdentifier String
+
+>     -- | This is a dialect specific quoted identifier with the quote
+>     -- characters explicit. The first and second fields are the
+>     -- starting and ending quote characters.n
+>     | DQIdentifier String String String
+>
+>     -- | This is a host param symbol, e.g. :param
+>     | HostParam String
+>
+>     -- | This is a string literal.
+>     | SqlString String
+>
+>     -- | This is a character set string literal. The first field is
+>     -- the charatecter set (one of nNbBxX).
+>     | CSSqlString String String
+>
+>     -- | a number literal (integral or otherwise), stored in original format
+>     -- unchanged
+>     | SqlNumber String
+>
+>     -- | non-significant whitespace (space, tab, newline) (strictly speaking,
+>     -- it is up to the client to decide whether the whitespace is significant
+>     -- or not)
+>     | Whitespace String
+>
+>     -- | a commented line using --, contains every character starting with the
+>     -- \'--\' and including the terminating newline character if there is one
+>     -- - this will be missing if the last line in the source is a line comment
+>     -- with no trailing newline
+>     | LineComment String
+>
+>     -- | a block comment, \/* stuff *\/, includes the comment delimiters
+>     | BlockComment String
+>
+>       deriving (Eq,Show)
+
+
+
+> -- | Accurate pretty printing, if you lex a bunch of tokens,
+> -- then pretty print them, should should get back exactly the
+> -- same string
+> prettyToken :: Dialect -> Token -> String
+> prettyToken _ (Symbol s) = s
+> prettyToken _ (Identifier t) = t
+> prettyToken _ (QIdentifier t) =
+>     "\"" ++ doubleChars '"' t ++ "\""
+> prettyToken _ (UQIdentifier t) =
+>     "u&\"" ++ doubleChars '"' t ++ "\""
+> prettyToken _ (DQIdentifier s e t) =
+>     s ++ t ++ e
+> prettyToken _ (HostParam p) = ':':p
+> prettyToken _ (SqlString t) = "'" ++ doubleChars '\'' t ++ "'"
+> prettyToken _ (CSSqlString cs t) = cs ++ "'" ++ t ++ "'"
+> prettyToken _ (SqlNumber r) = r
+> prettyToken _ (Whitespace t) = t
+> prettyToken _ (LineComment l) = l
+> prettyToken _ (BlockComment c) = c
+
+> prettyTokens :: Dialect -> [Token] -> String
+> prettyTokens d ts = concat $ map (prettyToken d) ts
+
+> doubleChars :: Char -> String -> String
+> doubleChars _ [] = []
+> doubleChars c (d:ds) | c == d = c:d:doubleChars c ds
+>                      | otherwise = d:doubleChars c ds
+
+TODO: try to make all parsers applicative only
+
+> type Position = (String,Int,Int)
+
+> addPosition :: Position -> String -> Position
+> addPosition = addPosition'
+
+> addPosition' :: Position -> String -> Position
+> addPosition' (f,l,c) [] = (f,l,c)
+> addPosition' (f,l,_) ('\n':xs) = addPosition' (f,l+1,0) xs
+> addPosition' (f,l,c) (_:xs) = addPosition' (f,l,c+1) xs
+
+
+
+> lexSQL :: Dialect -> Position -> String -> Either ParseError [(Position,Token)]
+> lexSQL dialect pos@(fn,_,_) txt =
+>     either (Left . convParseError fn) Right
+>     $ runParser (many_p pos <* eof) () "" txt
+>   where
+
+>      many_p pos' = some_p pos' `mplus` return []
+>      some_p pos' = do
+>        tok <- sqlToken dialect pos'
+>        let pos'' = advancePos dialect pos' (snd tok)
+>        (tok:) <$> many_p pos''
+
+> advancePos :: Dialect -> Position -> Token -> Position
+> advancePos dialect pos tok =
+>     let pt = prettyToken dialect tok
+>     in addPosition pos pt
+
+> -- | parser for a sql token
+> sqlToken :: Dialect -> Position -> Parser (Position,Token)
+> sqlToken d p =
+>     (p,) <$> choice [sqlString d
+>                     ,identifier d
+>                     ,hostParam d
+>                     ,lineComment d
+>                     ,blockComment d
+>                     ,sqlNumber d
+>                     ,symbol d
+>                     ,sqlWhitespace d]
+
+> takeWhile1 :: (Char -> Bool) -> Parser String
+> takeWhile1 p = many1 (satisfy p)
+
+> takeWhile :: (Char -> Bool) -> Parser String
+> takeWhile p = many (satisfy p)
+
+> takeTill :: (Char -> Bool) -> Parser String
+> takeTill p =
+>     try (manyTill anyChar (peekSatisfy p))
+
+> peekSatisfy :: (Char -> Bool) -> Parser ()
+> peekSatisfy p = do
+>     void $ try $ lookAhead (satisfy p)
+
+> identifier :: Dialect -> Parser Token
+> identifier d =
+>     choice
+>     [QIdentifier <$> qiden
+>     ,UQIdentifier <$> ((try (string "u&" <|> string "U&")) *> qiden)
+>     ,Identifier <$> identifierString
+>     ,DQIdentifier "`" "`" <$> mySqlQIden
+>     ]
+>   where
+>     qiden = char '"' *> qidenSuffix ""
+>     qidenSuffix t = do
+>         s <- takeTill (=='"')
+>         void $ char '"'
+>         -- deal with "" as literal double quote character
+>         choice [do
+>                 void $ char '"'
+>                 qidenSuffix $ concat [t,s,"\""]
+>                ,return $ concat [t,s]]
+>     -- mysql can quote identifiers with `
+>     mySqlQIden = do
+>         guard (d == MySQL)
+>         char '`' *> takeWhile1 (/='`') <* char '`'
+
+> identifierString :: Parser String
+> identifierString =
+>     startsWith (\c -> c == '_' || isAlpha c)
+>                (\c -> c == '_' || isAlphaNum c)
+
+
+> sqlString :: Dialect -> Parser Token
+> sqlString _ =
+>     choice [csString
+>            ,normalString
+>            ]
+>   where
+>     normalString = SqlString {-"'"-} <$> (char '\'' *> normalStringSuffix "")
+>     normalStringSuffix t = do
+>         s <- takeTill (=='\'')
+>         void $ char '\''
+>         -- deal with '' as literal quote character
+>         choice [do
+>                 void $ char '\''
+>                 normalStringSuffix $ concat [t,s,"'"]
+>                ,return $ concat [t,s]]
+>     csString = CSSqlString <$> try (cs  <* char '\'') <*> normalStringSuffix ""
+>     cs = choice [(:[]) <$> oneOf "nNbBxX"
+>                 ,string "u&"
+>                 ,string "U&"]
+
+> hostParam :: Dialect -> Parser Token
+> hostParam _ = HostParam <$> (char ':' *> identifierString)
+
+
+> sqlNumber :: Dialect -> Parser Token
+> sqlNumber _ = SqlNumber <$>
+>     (int <??> (pp dot <??.> pp int)
+>      <|> try ((++) <$> dot <*> int))
+>     <??> pp expon
+>   where
+>     int = many1 digit
+>     dot = string "."
+>     expon = (:) <$> oneOf "eE" <*> sInt
+>     sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
+>     pp = (<$$> (++))
+
+digits
+digits.[digits][e[+-]digits]
+[digits].digits[e[+-]digits]
+digitse[+-]digits
+
+where digits is one or more decimal digits (0 through 9). At least one
+digit must be before or after the decimal point, if one is used. At
+least one digit must follow the exponent marker (e), if one is
+present. There cannot be any spaces or other characters embedded in
+the constant. Note that any leading plus or minus sign is not actually
+considered part of the constant; it is an operator applied to the
+constant.
+
+
+A symbol is one of the two character symbols, or one of the single
+character symbols in the two lists below.
+
+> symbol :: Dialect -> Parser Token
+> symbol _ = Symbol <$> choice (many1 (char '.') :
+>                  map (try . string) [">=","<=","!=","<>","||"]
+>                  ++ map (string . (:[])) "+-^*/%~&|?<>[]=,;()")
+
+> sqlWhitespace :: Dialect -> Parser Token
+> sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
+
+> lineComment :: Dialect -> Parser Token
+> lineComment _ =
+>     (\s -> LineComment $ concat ["--",s]) <$>
+>     (try (string "--") *> choice
+>                     [flip snoc '\n' <$> takeTill (=='\n') <* char '\n'
+>                     ,takeWhile (/='\n') <* eof
+>                     ])
+>   where
+>     snoc :: String -> Char -> String
+>     snoc s a = s ++ [a]
+
+> blockComment :: Dialect -> Parser Token
+> blockComment _ =
+>     (\s -> BlockComment $ concat ["/*",s]) <$>
+>     (try (string "/*") *> commentSuffix 0)
+>   where
+>     commentSuffix :: Int -> Parser String
+>     commentSuffix n = do
+>       -- read until a possible end comment or nested comment
+>       x <- takeWhile (\e -> e /= '/' && e /= '*')
+>       choice [-- close comment: if the nesting is 0, done
+>               -- otherwise recurse on commentSuffix
+>               try (string "*/") *> let t = concat [x,"*/"]
+>                                    in if n == 0
+>                                       then return t
+>                                       else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
+>               -- nested comment, recurse
+>              ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
+>               -- not an end comment or nested comment, continue
+>              ,(:) <$> anyChar <*> commentSuffix n]
+
+
+> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
+> startsWith p ps = do
+>   c <- satisfy p
+>   choice [(:) c <$> (takeWhile1 ps)
+>          ,return [c]]
--- a/Language/SQL/SimpleSQL/Parser.lhs
+++ b/Language/SQL/SimpleSQL/Parser.lhs
@ -47,7 +47,7 @@ isn't done because of the impact on the parser error
 messages. Apparently it can also help the speed but this hasn't been
 looked into.

-== Parser rrror messages
+== Parser error messages

 A lot of care has been given to generating good parser error messages
 for invalid syntax. There are a few utils below which partially help
@ -185,22 +185,25 @@ fixing them in the syntax but leaving them till the semantic checking
 > import Control.Monad.Identity (Identity)
 > import Control.Monad (guard, void, when)
 > import Control.Applicative ((<$), (<$>), (<*>) ,(<*), (*>), (<**>), pure)
-> import Data.Maybe (catMaybes)
-> import Data.Char (toLower)
+> import Data.Char (toLower, isDigit)
 > import Text.Parsec (setPosition,setSourceColumn,setSourceLine,getPosition
->                    ,option,between,sepBy,sepBy1,string,manyTill,anyChar
->                    ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
->                    ,optionMaybe,optional,many,letter,runParser
->                    ,chainl1, chainr1,(<?>) {-,notFollowedBy,alphaNum-}, lookAhead)
+>                    ,option,between,sepBy,sepBy1
+>                    ,try,many1,(<|>),choice,eof
+>                    ,optionMaybe,optional,runParser
+>                    ,chainl1, chainr1,(<?>))
 > -- import Text.Parsec.String (Parser)
 > import Text.Parsec.Perm (permute,(<$?>), (<|?>))
-> import Text.Parsec.Prim (Parsec, getState)
+> import Text.Parsec.Prim (getState, token)
+> import Text.Parsec.Pos (newPos)
 > import qualified Text.Parsec.Expr as E
 > import Data.List (intercalate,sort,groupBy)
 > import Data.Function (on)
 > import Language.SQL.SimpleSQL.Syntax
 > import Language.SQL.SimpleSQL.Combinators
 > import Language.SQL.SimpleSQL.Errors
+> import qualified Language.SQL.SimpleSQL.Lexer as L
+> import Data.Maybe
+> import Text.Parsec.String (GenParser)

 = Public API

@ -257,14 +260,21 @@ converts the error return to the nice wrapper
 >           -> Maybe (Int,Int)
 >           -> String
 >           -> Either ParseError a
-> wrapParse parser d f p src =
+> wrapParse parser d f p src = do
+>     let (l,c) = fromMaybe (1,0) p
+>     lx <- L.lexSQL d (f,l,c) src
 >     either (Left . convParseError src) Right
->     $ runParser (setPos p *> whitespace *> parser <* eof)
->                 d f src
+>       $ runParser (setPos p *> parser <* eof)
+>                   d f $ filter keep lx
 >   where
 >     setPos Nothing = pure ()
 >     setPos (Just (l,c)) = fmap up getPosition >>= setPosition
 >       where up = flip setSourceColumn c . flip setSourceLine l
+>     keep (_,L.Whitespace {}) = False
+>     keep (_,L.LineComment {}) = False
+>     keep (_,L.BlockComment {}) = False
+>     keep _ = True
+

 ------------------------------------------------

@ -304,15 +314,11 @@ u&"example quoted"
 > name :: Parser Name
 > name = do
 >     d <- getState
->     choice [QName <$> quotedIdentifier
->            ,UQName <$> uquotedIdentifier
+>     choice [QName <$> qidentifierTok
+>            ,UQName <$> uqidentifierTok
 >            ,Name <$> identifierBlacklist (blacklist d)
->            ,dqName]
->   where
->     dqName = guardDialect [MySQL] *>
->              lexeme (DQName "`" "`"
->                      <$> (char '`'
->                           *> manyTill anyChar (char '`')))
+>            ,(\(s,e,t) -> DQName s e t) <$> dqidentifierTok
+>            ]

 todo: replace (:[]) with a named function all over

@ -433,7 +439,7 @@ Unfortunately, to improve the error messages, there is a lot of (left)
 factoring in this function, and it is a little dense.

 > typeName :: Parser TypeName
-> typeName = lexeme $
+> typeName =
 >     (rowTypeName <|> intervalTypeName <|> otherTypeName)
 >     <??*> tnSuffix
 >   where
@ -536,20 +542,13 @@ factoring in this function, and it is a little dense.
 See the stringToken lexer below for notes on string literal syntax.

 > stringLit :: Parser ValueExpr
-> stringLit = StringLit <$> stringToken
+> stringLit = StringLit <$> stringTokExtend

 > numberLit :: Parser ValueExpr
-> numberLit = NumLit <$> numberLiteral
+> numberLit = NumLit <$> sqlNumberTok

 > characterSetLit :: Parser ValueExpr
-> characterSetLit =
->     CSStringLit <$> shortCSPrefix <*> stringToken
->   where
->     shortCSPrefix = try $ choice
->         [(:[]) <$> oneOf "nNbBxX"
->         ,string "u&"
->         ,string "U&"
->         ] <* lookAhead quote
+> characterSetLit = uncurry CSStringLit <$> csSqlStringLitTok

 > simpleLiteral :: Parser ValueExpr
 > simpleLiteral = numberLit <|> stringLit <|> characterSetLit
@ -575,8 +574,8 @@ select x from t where x > :param
 > parameter = choice
 >     [Parameter <$ questionMark
 >     ,HostParameter
->      <$> hostParameterToken
->      <*> optionMaybe (keyword "indicator" *> hostParameterToken)]
+>      <$> hostParamTok
+>      <*> optionMaybe (keyword "indicator" *> hostParamTok)]

 == parens

@ -675,7 +674,7 @@ this. also fix the monad -> applicative
 > intervalLit = try (keyword_ "interval" >> do
 >     s <- optionMaybe $ choice [True <$ symbol_ "+"
 >                               ,False <$ symbol_ "-"]
->     lit <- stringToken
+>     lit <- stringTok
 >     q <- optionMaybe intervalQualifier
 >     mkIt s lit q)
 >   where
@ -701,7 +700,7 @@ all the value expressions which start with an identifier
 > idenExpr :: Parser ValueExpr
 > idenExpr =
 >     -- todo: work out how to left factor this
->     try (TypedLit <$> typeName <*> stringToken)
+>     try (TypedLit <$> typeName <*> stringTokExtend)
 >     <|> (names <**> option Iden app)

 === special
@ -731,9 +730,11 @@ operatorname(firstArg keyword0 arg0 keyword1 arg1 etc.)
 >               e <- valueExpr
 >               -- check we haven't parsed the first
 >               -- keyword as an identifier
->               guard (case (e,kws) of
->                   (Iden [Name i], (k,_):_) | map toLower i == k -> False
->                   _ -> True)
+>               case (e,kws) of
+>                   (Iden [Name i], (k,_):_)
+>                       | map toLower i == k ->
+>                           fail $ "cannot use keyword here: " ++ i
+>                   _ -> return ()
 >               pure e
 >     fa <- case firstArg of
 >          SOKNone -> pure Nothing
@ -806,7 +807,7 @@ in the source
 >     keyword "trim" >>
 >     parens (mkTrim
 >             <$> option "both" sides
->             <*> option " " stringToken
+>             <*> option " " stringTok
 >             <*> (keyword_ "from" *> valueExpr))
 >   where
 >     sides = choice ["leading" <$ keyword_ "leading"
@ -986,13 +987,28 @@ a match (select a from t)

 === escape

+It is going to be really difficult to support an arbitrary character
+for the escape now there is a separate lexer ...
+
 > escapeSuffix :: Parser (ValueExpr -> ValueExpr)
 > escapeSuffix = do
 >     ctor <- choice
 >             [Escape <$ keyword_ "escape"
 >             ,UEscape <$ keyword_ "uescape"]
->     c <- anyChar
+>     c <- escapeChar
 >     pure $ \v -> ctor v c
+>   where
+>     escapeChar = escapeIden <|> escapeSym
+>     escapeIden = do
+>                  c <- identifierTok
+>                  case c of
+>                    [c'] -> return c'
+>                    _ -> fail "escape char must be single char"
+>     escapeSym = do
+>                c <- symbolTok
+>                case c of
+>                  [c'] -> return c'
+>                  _ -> fail "escape char must be single char"

 === collate

@ -1018,7 +1034,7 @@ syntax is way too messy. It might be possible to avoid this if we
 wanted to avoid extensibility and to not be concerned with parse error
 messages, but both of these are too important.

-> opTable :: Bool -> [[E.Operator String ParseState Identity ValueExpr]]
+> opTable :: Bool -> [[E.Operator [Token] ParseState Identity ValueExpr]]
 > opTable bExpr =
 >         [-- parse match and quantified comparisons as postfix ops
 >           -- todo: left factor the quantified comparison with regular
@ -1452,163 +1468,134 @@ thick.

 ------------------------------------------------

-= lexing parsers
+= lexing

-whitespace parser which skips comments also
+TODO: push checks into here:
+keyword blacklists
+unsigned integer match
+symbol matching
+keyword matching

-> whitespace :: Parser ()
-> whitespace =
->     choice [simpleWhitespace *> whitespace
->            ,lineComment *> whitespace
->            ,blockComment *> whitespace
->            ,pure ()] <?> "whitespace"
+> csSqlStringLitTok :: Parser (String,String)
+> csSqlStringLitTok = mytoken (\tok ->
+>     case tok of
+>       L.CSSqlString p s -> Just (p,s)
+>       _ -> Nothing)
+
+> stringTok :: Parser String
+> stringTok = mytoken (\tok ->
+>     case tok of
+>       L.SqlString s -> Just s
+>       _ -> Nothing)
+
+This is to support SQL strings where you can write
+'part of a string' ' another part'
+and it will parse as a single string
+
+> stringTokExtend :: Parser String
+> stringTokExtend = do
+>     x <- stringTok
+>     choice [
+>         ((x++) <$> stringTokExtend)
+>         ,return x
+>         ]
+
+> hostParamTok :: Parser String
+> hostParamTok = mytoken (\tok ->
+>     case tok of
+>       L.HostParam p -> Just p
+>       _ -> Nothing)
+
+> sqlNumberTok :: Parser String
+> sqlNumberTok = mytoken (\tok ->
+>     case tok of
+>       L.SqlNumber p -> Just p
+>       _ -> Nothing)
+
+
+> symbolTok :: Parser String
+> symbolTok = mytoken (\tok ->
+>     case tok of
+>       L.Symbol p -> Just p
+>       _ -> Nothing)
+
+> identifierTok :: Parser String
+> identifierTok = mytoken (\tok ->
+>     case tok of
+>       L.Identifier p -> Just p
+>       _ -> Nothing)
+
+> qidentifierTok :: Parser String
+> qidentifierTok = mytoken (\tok ->
+>     case tok of
+>       L.QIdentifier p -> Just p
+>       _ -> Nothing)
+
+> dqidentifierTok :: Parser (String,String,String)
+> dqidentifierTok = mytoken (\tok ->
+>     case tok of
+>       L.DQIdentifier s e t -> Just (s,e,t)
+>       _ -> Nothing)
+
+> uqidentifierTok :: Parser String
+> uqidentifierTok = mytoken (\tok ->
+>     case tok of
+>       L.UQIdentifier p -> Just p
+>       _ -> Nothing)
+
+
+> mytoken :: (L.Token -> Maybe a) -> Parser a
+> mytoken test = token showToken posToken testToken
 >   where
->     lineComment = try (string "--")
->                   *> manyTill anyChar (void (char '\n') <|> eof)
->     blockComment = -- no nesting of block comments in SQL
->                    try (string "/*")
->                    -- try used here so it doesn't fail when we see a
->                    -- '*' which isn't followed by a '/'
->                    *> manyTill anyChar (try $ string "*/")
->     -- use many1 so we can more easily avoid non terminating loops
->     simpleWhitespace = void $ many1 (oneOf " \t\n")
-
-> lexeme :: Parser a -> Parser a
-> lexeme p = p <* whitespace
+>     showToken (_,tok)   = show tok
+>     posToken  ((a,b,c),_)  = newPos a b c
+>     testToken (_,tok)   = test tok

 > unsignedInteger :: Parser Integer
-> unsignedInteger = read <$> lexeme (many1 digit) <?> "integer"
-
-
-number literals
-
-here is the rough grammar target:
-
-digits
-digits.[digits][e[+-]digits]
-[digits].digits[e[+-]digits]
-digitse[+-]digits
-
-numbers are parsed to strings, not to a numeric type. This is to avoid
-making a decision on how to represent numbers, the client code can
-make this choice.
-
-> numberLiteral :: Parser String
-> numberLiteral = lexeme (
->     (int <??> (pp dot <??.> pp int)
->      <|> (++) <$> dot <*> int)
->     <??> pp expon)
->   where
->     int = many1 digit
->     dot = string "."
->     expon = (:) <$> oneOf "eE" <*> sInt
->     sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
->     pp = (<$$> (++))
-
-
-> identifier :: Parser String
-> identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
->              <?> "identifier"
->   where
->     firstChar = letter <|> char '_' <?> "identifier"
->     nonFirstChar = digit <|> firstChar <?> ""
-
-> quotedIdentifier :: Parser String
-> quotedIdentifier = quotedIdenHelper
-
-> quotedIdenHelper :: Parser String
-> quotedIdenHelper =
->     lexeme (dq *> manyTill anyChar dq >>= optionSuffix moreIden)
->     <?> "identifier"
->   where
->     moreIden s0 = do
->          void dq
->          s <- manyTill anyChar dq
->          optionSuffix moreIden (s0 ++ "\"" ++ s)
->     dq = char '"' <?> "double quote"
-
-> uquotedIdentifier :: Parser String
-> uquotedIdentifier =
->   try (string "u&" <|> string "U&") *> quotedIdenHelper
->   <?> "identifier"
-
-parses an identifier with a : prefix. The : isn't included in the
-return value
-
-> hostParameterToken :: Parser String
-> hostParameterToken = lexeme $ char ':' *> identifier
+> unsignedInteger = try (do
+>                    x <- sqlNumberTok
+>                    guard (all isDigit x)
+>                    return $ read x
+>                    ) <?> "integer"

 todo: work out the symbol parsing better

 > symbol :: String -> Parser String
-> symbol s = try (lexeme $ do
->     u <- choice (many1 (char '.') :
->                  map (try . string) [">=","<=","!=","<>","||"]
->                  ++ map (string . (:[])) "+-^*/%~&|<>=")
+> symbol s = try (do
+>   u <- symbolTok
 >   guard (s == u)
->     pure s)
->     <?> s
+>   pure s) <?> s
+
+> singleCharSymbol :: Char -> Parser Char
+> singleCharSymbol c = c <$ symbol [c]

 > questionMark :: Parser Char
-> questionMark = lexeme (char '?') <?> "question mark"
+> questionMark = singleCharSymbol '?' <?> "question mark"

 > openParen :: Parser Char
-> openParen = lexeme $ char '('
+> openParen = singleCharSymbol '('

 > closeParen :: Parser Char
-> closeParen = lexeme $ char ')'
+> closeParen = singleCharSymbol ')'

 > openBracket :: Parser Char
-> openBracket = lexeme $ char '['
+> openBracket = singleCharSymbol '['

 > closeBracket :: Parser Char
-> closeBracket = lexeme $ char ']'
+> closeBracket = singleCharSymbol ']'


 > comma :: Parser Char
-> comma = lexeme (char ',') <?> "comma"
+> comma = singleCharSymbol ','

 > semi :: Parser Char
-> semi = lexeme (char ';') <?> "semicolon"
-
-> quote :: Parser Char
-> quote = lexeme (char '\'') <?> "single quote"
-
-> --stringToken :: Parser String
-> --stringToken = lexeme (char '\'' *> manyTill anyChar (char '\''))
-> -- todo: tidy this up, add the prefixes stuff, and add the multiple
-> -- string stuff
-> stringToken :: Parser String
-> stringToken =
->     lexeme (nlquote *> manyTill anyChar nlquote
->     >>= optionSuffix moreString)
->     <?> "string"
->   where
->     moreString s0 = choice
->         [-- handle two adjacent quotes
->          do
->          void nlquote
->          s <- manyTill anyChar nlquote
->          optionSuffix moreString (s0 ++ "'" ++ s)
->         ,-- handle string in separate parts
->          -- e.g. 'part 1' 'part 2'
->          do --can this whitespace be factored out?
->             -- since it will be parsed twice when there is no more literal
->             -- yes: split the adjacent quote and multiline literal
->             -- into two different suffixes
->             -- won't need to call lexeme at the top level anymore after this
->          try (whitespace <* nlquote)
->          s <- manyTill anyChar nlquote
->          optionSuffix moreString (s0 ++ s)
->         ]
->     -- non lexeme quote
->     nlquote = char '\'' <?> "single quote"
+> semi = singleCharSymbol ';'

 = helper functions

 > keyword :: String -> Parser String
 > keyword k = try (do
->     i <- identifier
+>     i <- identifierTok
 >     guard (map toLower i == k)
 >     pure k) <?> k

@ -1638,7 +1625,7 @@ helper function to improve error messages

 > identifierBlacklist :: [String] -> Parser String
 > identifierBlacklist bl = try (do
->     i <- identifier
+>     i <- identifierTok
 >     when (map toLower i `elem` bl) $
 >         fail $ "keyword not allowed here: " ++ i
 >     pure i)
@ -2001,7 +1988,10 @@ different parsers can be used for different dialects

 > type ParseState = Dialect

-> type Parser = Parsec String ParseState
+> type Token = (L.Position,L.Token)
+
+> --type Parser = Parsec String ParseState
+> type Parser = GenParser Token ParseState

 > guardDialect :: [Dialect] -> Parser ()
 > guardDialect ds = do
--- a/simple-sql-parser.cabal
+++ b/simple-sql-parser.cabal
@ -29,6 +29,7 @@ Flag sqlindent
 library
  exposed-modules:     Language.SQL.SimpleSQL.Pretty,
                       Language.SQL.SimpleSQL.Parser,
+                       Language.SQL.SimpleSQL.Lexer,
                       Language.SQL.SimpleSQL.Syntax
  Other-Modules:       Language.SQL.SimpleSQL.Errors,
                       Language.SQL.SimpleSQL.Combinators
@ -55,6 +56,7 @@ Test-Suite Tests

  Other-Modules:       Language.SQL.SimpleSQL.Pretty,
                       Language.SQL.SimpleSQL.Parser,
+                       Language.SQL.SimpleSQL.Lexer,
                       Language.SQL.SimpleSQL.Syntax,
                       Language.SQL.SimpleSQL.Errors,
                       Language.SQL.SimpleSQL.Combinators
@ -71,11 +73,12 @@ Test-Suite Tests
                       Language.SQL.SimpleSQL.TestTypes,
                       Language.SQL.SimpleSQL.Tests,
                       Language.SQL.SimpleSQL.Tpch,
-                       Language.SQL.SimpleSQL.ValueExprs
+                       Language.SQL.SimpleSQL.ValueExprs,
+                       Language.SQL.SimpleSQL.LexerTests

  other-extensions:    TupleSections,DeriveDataTypeable
  default-language:    Haskell2010
-  ghc-options:         -Wall
+  ghc-options:         -Wall -threaded

 executable SQLIndent
  main-is:             SQLIndent.lhs
@ -91,3 +94,15 @@ executable SQLIndent
    buildable:         True
  else
    buildable:         False
+
+executable TestLex
+  main-is:             TestLex.lhs
+  hs-source-dirs:      .,tools
+  Build-Depends:       base >=4.5 && <4.9,
+                       parsec >=3.1 && <3.2,
+                       mtl >=2.1 && <2.3,
+                       pretty >= 1.1 && < 1.2
+  other-extensions:    TupleSections,DeriveDataTypeable
+  default-language:    Haskell2010
+  ghc-options:         -Wall
+  buildable:           False
--- a/tools/Language/SQL/SimpleSQL/LexerTests.lhs
+++ b/tools/Language/SQL/SimpleSQL/LexerTests.lhs
@ -0,0 +1,144 @@
+
+
+Test for the lexer
+
+> module Language.SQL.SimpleSQL.LexerTests (lexerTests) where
+
+> import Language.SQL.SimpleSQL.TestTypes
+> import Language.SQL.SimpleSQL.Lexer (Token(..))
+> --import Debug.Trace
+
+> lexerTable :: [(String,[Token])]
+> lexerTable =
+>     -- single char symbols
+>     map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;()"
+>     -- multi char symbols
+>     ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||"]
+>     ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
+>         -- simple identifiers
+>         in map (\i -> (i, [Identifier i])) idens
+>            ++ map (\i -> ("\"" ++ i ++ "\"", [QIdentifier i])) idens
+>            -- todo: in order to make lex . pretty id, need to
+>            -- preserve the case of the u
+>            ++ map (\i -> ("u&\"" ++ i ++ "\"", [UQIdentifier i])) idens
+>            -- host param
+>            ++ map (\i -> (':':i, [HostParam i])) idens
+>        )
+>     -- quoted identifiers with embedded double quotes
+>     ++ [("\"normal \"\" iden\"", [QIdentifier "normal \" iden"])]
+>     -- strings
+>     ++ [("'string'", [SqlString "string"])
+>        ,("'normal '' quote'", [SqlString "normal ' quote"])
+>        ,("'normalendquote '''", [SqlString "normalendquote '"])]
+>     -- csstrings
+>     ++ map (\c -> (c ++ "'test'", [CSSqlString c "test"]))
+>        ["n", "N","b", "B","x", "X", "u&"]
+>     -- numbers
+>     ++ [("10", [SqlNumber "10"])
+>        ,(".1", [SqlNumber ".1"])
+>        ,("5e3", [SqlNumber "5e3"])
+>        ,("5e+3", [SqlNumber "5e+3"])
+>        ,("5e-3", [SqlNumber "5e-3"])
+>        ,("10.2", [SqlNumber "10.2"])
+>        ,("10.2e7", [SqlNumber "10.2e7"])]
+>     -- whitespace
+>     ++ concat [[([a],[Whitespace [a]])
+>                ,([a,b], [Whitespace [a,b]])]
+>               | a <- " \n\t", b <- " \n\t"]
+>     -- line comment
+>     ++ map (\c -> (c, [LineComment c]))
+>        ["--", "-- ", "-- this is a comment"]
+>     -- block comment
+>     ++ map (\c -> (c, [BlockComment c]))
+>        ["/**/", "/* */","/* this is a comment */"
+>         -- todo: bug in the lexer, fix this
+>         --,"/* this *is/ a comment */"
+>        ]
+
+
+> lexerTests :: TestItem
+> lexerTests = Group "lexerTests" $
+>     [LexerTest SQL2011 s t |  (s,t) <- lexerTable]
+>     ++
+>     [ LexerTest SQL2011 (s ++ s1) (t ++ t1)
+>     | (s,t) <- lexerTable
+>     , (s1,t1) <- lexerTable
+
+which combinations won't work:
+<> <= >= || two single symbols which make a double char symbol
+identifier + identifier if both are quoted or unquoted
+string string
+csstring string
+line comment anything (can add newline?)
+number number (todo: double check more carefully)
+
+>     , isGood $ t ++ t1
+
+>     ]
+>     ++ map (uncurry $ LexerTest SQL2011)
+>        [("", [])
+>        ]
+
+>  where
+>    isGood :: [Token] -> Bool
+>    isGood l = {-let b =-} and $ map not [p l | p <- map listPred badCombos]
+>               -- in trace ("isGood " ++ show (l,b)) b
+>    badCombos :: [((Token -> Bool),(Token -> Bool))]
+>    badCombos = [symbolPair "<" ">"
+>                ,symbolPair "<" "="
+>                ,symbolPair ">" "="
+>                ,symbolPair "!" "="
+>                ,symbolPair "|" "|"
+>                ,symbolPair "||" "|"
+>                ,symbolPair "|" "||"
+>                ,symbolPair "||" "||"
+>                ,symbolPair "<" ">="
+
+>                ,symbolPair "-" "-"
+>                ,symbolPair "/" "*"
+>                ,symbolPair "*" "/"
+
+>                ,(isIdentifier, isIdentifier)
+>                ,(isQIdentifier, isQIdentifier)
+>                ,(isUQIdentifier, isQIdentifier)
+>                ,(isString, isString)
+>                ,(isCsString, isString)
+>                ,(isLineComment, const True)
+>                ,(isNumber, isNumber)
+>                ,(isHostParam,isIdentifier)
+>                ,(isHostParam,isCsString)
+>                ,(isHostParam,isUQIdentifier)
+>                ,(isIdentifier,isCsString)
+>                ,(isIdentifier,isUQIdentifier)
+>                ,(isWhitespace, isWhitespace)
+>                ,(isIdentifier, isNumber)
+>                ,(isHostParam, isNumber)
+>                ,(isMinus, isLineComment)
+>                ]
+>    isIdentifier (Identifier _) = True
+>    isIdentifier _ = False
+>    isQIdentifier (QIdentifier _) = True
+>    isQIdentifier _ = False
+>    isUQIdentifier (UQIdentifier _) = True
+>    isUQIdentifier _ = False
+>    isCsString (CSSqlString {}) = True
+>    isCsString _ = False
+>    isLineComment (LineComment{}) = True
+>    isLineComment _ = False
+>    isNumber (SqlNumber{}) = True
+>    isNumber _ = False
+>    isHostParam (HostParam{}) = True
+>    isHostParam _ = False
+>    isWhitespace (Whitespace{}) = True
+>    isWhitespace _ = False
+>    isMinus (Symbol "-") = True
+>    isMinus _ = False
+
+>    isString (SqlString _) = True
+>    isString _ = False
+>    symbolPair a b = ((==Symbol a), (==Symbol b))
+>    listPred :: ((Token -> Bool),(Token -> Bool)) -> [Token] -> Bool
+>    listPred _ [] = False
+>    listPred _ [_] = False
+>    listPred (p,p1) (t:t1:ts) | p t && p1 t1 = True
+>                              | otherwise = listPred (p,p1) (t1:ts)
--- a/tools/Language/SQL/SimpleSQL/TestTypes.lhs
+++ b/tools/Language/SQL/SimpleSQL/TestTypes.lhs
@ -7,6 +7,7 @@ Tests.lhs module for the 'interpreter'.
 >     ,Dialect(..)) where

 > import Language.SQL.SimpleSQL.Syntax
+> import Language.SQL.SimpleSQL.Lexer (Token)

 TODO: maybe make the dialect args into [dialect], then each test
 checks all the dialects mentioned work, and all the dialects not
@ -28,4 +29,5 @@ check that the string given fails to parse

 >               | ParseQueryExprFails Dialect String
 >               | ParseValueExprFails Dialect String
+>               | LexerTest Dialect String [Token]
 >                 deriving (Eq,Show)
--- a/tools/Language/SQL/SimpleSQL/Tests.lhs
+++ b/tools/Language/SQL/SimpleSQL/Tests.lhs
@ -15,6 +15,7 @@ test data to the Test.Framework tests.
 > --import Language.SQL.SimpleSQL.Syntax
 > import Language.SQL.SimpleSQL.Pretty
 > import Language.SQL.SimpleSQL.Parser
+> import Language.SQL.SimpleSQL.Lexer

 > import Language.SQL.SimpleSQL.TestTypes

@ -26,6 +27,7 @@ test data to the Test.Framework tests.
 > import Language.SQL.SimpleSQL.TableRefs
 > import Language.SQL.SimpleSQL.ValueExprs
 > import Language.SQL.SimpleSQL.Tpch
+> import Language.SQL.SimpleSQL.LexerTests

 > import Language.SQL.SimpleSQL.SQL2011

@ -37,7 +39,8 @@ order on the generated documentation.
 > testData :: TestItem
 > testData =
 >     Group "parserTest"
->     [valueExprTests
+>     [lexerTests
+>     ,valueExprTests
 >     ,queryExprComponentTests
 >     ,queryExprsTests
 >     ,tableRefTests
@ -73,6 +76,14 @@ order on the generated documentation.
 > itemToTest (ParseValueExprFails d str) =
 >     toFTest parseValueExpr prettyValueExpr d str

+> itemToTest (LexerTest d s ts) = makeLexerTest d s ts
+
+> makeLexerTest :: Dialect -> String -> [Token] -> T.TestTree
+> makeLexerTest d s ts = H.testCase s $ do
+>     let lx = either (error . show) id $ lexSQL d ("", 1, 1) s
+>     H.assertEqual "" ts $ map snd lx
+>     let s' = prettyTokens d $ map snd lx
+>     H.assertEqual "pretty print" s s'

 > toTest :: (Eq a, Show a) =>
 >           (Dialect -> String -> Maybe (Int,Int) -> String -> Either ParseError a)
@ -123,9 +134,9 @@ order on the generated documentation.
 >        -> Dialect
 >        -> String
 >        -> T.TestTree
-> toFTest parser pp d str = H.testCase str $ do
+> toFTest parser _pp d str = H.testCase str $ do
 >         let egot = parser d "" Nothing str
 >         case egot of
->             Left e -> return ()
->             Right got ->
+>             Left _e -> return ()
+>             Right _got ->
 >               H.assertFailure $ "parse didn't fail: " ++ show d ++ "\n" ++ str