
get old lexing code working again, now only 3 tests fail

Jake Wheat 2024-01-10 11:28:34 +00:00
parent 0f307f51c7
commit 4e09fe9f45
5 changed files with 167 additions and 269 deletions


@@ -19,6 +19,21 @@ directly without the separately testing lexing stage.
 TODO:
+optimisations:
+check for left factor opportunities
+check for places where it parses a few substrings from the source,
+then puts them back together with a concatenate of some flavour
+-> this is better if can find a way to parse the entire string
+from the source and lift it in one go into the lexical token
+before this is done, a smaller optimisation is when any code matches
+a constant string in the lexer, use that constant string instead
+of the string from the parser, it might make a small difference in
+a few places
+maybe every token should carry the exact source as well as any fields
+it's been broken into - so pretty printing is trivial
 make the tokenswill print more dialect accurate. Maybe add symbol
 chars and identifier chars to the dialect definition and use them from
 here
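The "parse the entire string from the source and lift it in one go" idea maps naturally onto megaparsec's match combinator, which returns the exact input a parser consumed alongside its result. A minimal sketch, not part of this commit (withSource is a hypothetical name):

    import Text.Megaparsec (match)

    -- run a parser, also returning the verbatim source text it consumed,
    -- so a token could carry its exact source and pretty printing is trivial
    withSource :: Parser a -> Parser (Text, a)
    withSource = match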
@@ -98,12 +113,19 @@ import Text.Megaparsec
     ,many
     ,try
     ,option
+    ,(<|>)
+    ,notFollowedBy
+    ,manyTill
+    ,anySingle
+    ,lookAhead
     )
+import qualified Text.Megaparsec as M
 import Text.Megaparsec.Char
     (string
     ,char
     )
 import Text.Megaparsec.State (initialState)
+import Control.Applicative ((<**>))
 import Data.Void (Void)
@@ -113,7 +135,7 @@ import Data.Char
     ,isSpace
     ,isDigit
     )
-import Control.Monad (void)
+import Control.Monad (void, guard)
 import Data.Text (Text)
 import qualified Data.Text as T
@@ -241,7 +263,7 @@ sqlToken d = do
         ,blockComment d
         ,sqlNumber d
         ,positionalArg d
-        --,dontParseEndBlockComment d
+        ,dontParseEndBlockComment d
         ,prefixedVariable d
         ,symbol d
         ,sqlWhitespace d]
@@ -251,10 +273,6 @@ sqlToken d = do
 --------------------------------------
 
-sqlString :: Dialect -> Parser Token
-sqlString _d =
-    SqlString "'" "'" <$> (char_ '\'' *> takeWhileP (Just "non quote char") (/= '\'') <* char_ '\'')
-
 {-
 Parse a SQL string. Examples:
@@ -265,7 +283,6 @@ b'binary string'
 x'hexidecimal string'
 -}
 
-{-
 sqlString :: Dialect -> Parser Token
 sqlString d = dollarString <|> csString <|> normalString
   where
@@ -273,21 +290,21 @@ sqlString d = dollarString <|> csString <|> normalString
         guard $ diDollarString d
         -- use try because of ambiguity with symbols and with
         -- positional arg
-        delim <- (\x -> concat ["$",x,"$"])
+        delim <- (\x -> T.concat ["$",x,"$"])
                  <$> try (char '$' *> option "" identifierString <* char '$')
-        SqlString delim delim <$> manyTill anyChar (try $ string delim)
+        SqlString delim delim . T.pack <$> manyTill anySingle (try $ string delim)
     normalString = SqlString "'" "'" <$> (char '\'' *> normalStringSuffix False "")
     normalStringSuffix allowBackslash t = do
-        s <- takeTill $ if allowBackslash
-                        then (`elem` "'\\")
-                        else (== '\'')
+        s <- takeWhileP Nothing $ if allowBackslash
+                                  then (`notElemChar` "'\\")
+                                  else (/= '\'')
         -- deal with '' or \' as literal quote character
         choice [do
             ctu <- choice ["''" <$ try (string "''")
                           ,"\\'" <$ string "\\'"
                           ,"\\" <$ char '\\']
-            normalStringSuffix allowBackslash $ concat [t,s,ctu]
-           ,concat [t,s] <$ char '\'']
+            normalStringSuffix allowBackslash $ T.concat [t,s,ctu]
+           ,T.concat [t,s] <$ char '\'']
     -- try is used to to avoid conflicts with
     -- identifiers which can start with n,b,x,u
     -- once we read the quote type and the starting '
@@ -299,38 +316,19 @@ sqlString d = dollarString <|> csString <|> normalString
     csString
     | diEString d =
         choice [SqlString <$> try (string "e'" <|> string "E'")
-                <*> return "'" <*> normalStringSuffix True ""
+                <*> pure "'" <*> normalStringSuffix True ""
               ,csString']
     | otherwise = csString'
     csString' = SqlString
                 <$> try cs
-                <*> return "'"
+                <*> pure "'"
                 <*> normalStringSuffix False ""
-    csPrefixes = "nNbBxX"
-    cs = choice $ (map (\x -> string ([x] ++ "'")) csPrefixes)
-         ++ [string "u&'"
-            ,string "U&'"]
--}
+    csPrefixes = (map (flip T.cons "'") "nNbBxX") ++ ["u&'", "U&'"]
+    cs :: Parser Text
+    cs = choice $ map string csPrefixes
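For orientation, the shape of the string tokens this parser produces: SqlString takes the start quote, the end quote, and the contents. A sketch in the style of the test tables later in this commit (the prefixed and dollar-quoted rows appear there verbatim):

    stringTokenExamples :: [(Text, [Token])]
    stringTokenExamples =
        [("'test'", [SqlString "'" "'" "test"])
        ,("n'test'", [SqlString "n'" "'" "test"])  -- character set prefix
        ,("$a$ $$string 3$$ $a$", [SqlString "$a$" "$a$" " $$string 3$$ "])]  -- dollar quoting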
 --------------------------------------
 
--- TODO: this reconstitutes the string from bits, instead of lifting
--- it in one piece from the source. This is a performance issue, not
--- sure if it will be significant. The same comment applies to most of
--- the other parsers
-
-identifier :: Dialect -> Parser Token
-identifier d = Identifier Nothing <$> identifierString d
-
-identifierString :: Dialect -> Parser Text
-identifierString _ = (do
-    c <- satisfy isFirstLetter
-    choice
-        [T.cons c <$> (takeWhileP (Just "identifier char") isNonFirstLetter)
-        ,pure $ T.singleton c]) <?> "identifier"
-  where
-    isFirstLetter c = c == '_' || isAlpha c
-    isNonFirstLetter c = c == '_' || isAlphaNum c
-
 {-
 Parses identifiers:
@@ -341,7 +339,6 @@ u&"unicode quoted identifier"
 `mysql quoted identifier`
 -}
 
-{-
 identifier :: Dialect -> Parser Token
 identifier d =
     choice
@@ -355,50 +352,39 @@ identifier d =
     regularIden = Identifier Nothing <$> identifierString
     quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
     mySqlQuotedIden = Identifier (Just ("`","`"))
-                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
+                      <$> (char '`' *> takeWhile1P Nothing (/='`') <* char '`')
     sqlServerQuotedIden = Identifier (Just ("[","]"))
-                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
+                          <$> (char '[' *> takeWhile1P Nothing (`notElemChar` "[]") <* char ']')
     -- try is used here to avoid a conflict with identifiers
     -- and quoted strings which also start with a 'u'
     unicodeQuotedIden = Identifier
-                        <$> (f <$> try (oneOf "uU" <* string "&"))
+                        <$> (f <$> try ((oneOf "uU") <* string "&"))
                         <*> qidenPart
-      where f x = Just (x: "&\"", "\"")
+      where f x = Just (T.cons x "&\"", "\"")
     qidenPart = char '"' *> qidenSuffix ""
     qidenSuffix t = do
-        s <- takeTill (=='"')
+        s <- takeWhileP Nothing (/='"')
         void $ char '"'
         -- deal with "" as literal double quote character
         choice [do
             void $ char '"'
-            qidenSuffix $ concat [t,s,"\"\""]
-           ,return $ concat [t,s]]
+            qidenSuffix $ T.concat [t,s,"\"\""]
+           ,pure $ T.concat [t,s]]
 
--- This parses a valid identifier without quotes.
-
-identifierString :: Parser String
-identifierString =
-    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
-
--- this can be moved to the dialect at some point
+identifierString :: Parser Text
+identifierString = (do
+    c <- satisfy isFirstLetter
+    choice
+        [T.cons c <$> (takeWhileP (Just "identifier char") isIdentifierChar)
+        ,pure $ T.singleton c]) <?> "identifier"
+  where
+    isFirstLetter c = c == '_' || isAlpha c
 
 isIdentifierChar :: Char -> Bool
 isIdentifierChar c = c == '_' || isAlphaNum c
--}
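Similarly, identifier tokens carry an optional pair of quotes around the name. A sketch with rows in the style of the lexer test tables:

    identifierTokenExamples :: [(Text, [Token])]
    identifierTokenExamples =
        [("_a", [Identifier Nothing "_a"])                         -- regular
        ,("\"Stuff\"", [Identifier (Just ("\"","\"")) "Stuff"])    -- ansi quoted
        ,("u&\"test\"", [Identifier (Just ("u&\"","\"")) "test"])  -- unicode quoted
        ,("`test`", [Identifier (Just ("`","`")) "test"])]         -- mysql quoted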
 --------------------------------------
 
+{-
+I think it's always faster to use a string locally created in the parser code,
+than to use one taken from the parsed source, unless you take it without modifying it,
+the example here is using -- and \n. this won't be needed in this case if can work out
+how to lift the entire comment as a single string from the source.
+this concept does apply to things like symbols
+-}
 lineComment :: Dialect -> Parser Token
 lineComment _ = do
     try (string_ "--") <?> ""
@@ -407,62 +393,22 @@ lineComment _ = do
     suf <- option "" ("\n" <$ char_ '\n')
     pure $ LineComment $ T.concat ["--", rest, suf]
 
-{-lineComment :: Dialect -> Parser Token
-lineComment _ =
-    (\s -> LineComment $ concat ["--",s]) <$>
-    -- try is used here in case we see a - symbol
-    -- once we read two -- then we commit to the comment token
-    (try (string "--") *> (
-        -- todo: there must be a better way to do this
-    conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
-  where
-    conc a Nothing = a
-    conc a (Just b) = a ++ b
-    lineCommentEnd =
-        Just "\n" <$ char '\n'
-        <|> Nothing <$ eof-}
 --------------------------------------
 
+-- TODO: the parser before the switch to megaparsec parsed nested block comments
+-- I don't know any dialects that use this, but I think it's useful, if needed,
+-- add it back in under a dialect flag?
 blockComment :: Dialect -> Parser Token
 blockComment _ = (do
     try $ string_ "/*"
     BlockComment . T.concat . ("/*":) <$> more) <?> ""
   where
     more = choice
-        [["*/"] <$ try (string_ "*/")
-        ,char_ '*' *> (("*":) <$> more)
-        -- not sure if there's an easy optimisation here
+        [["*/"] <$ try (string_ "*/") -- comment ended
+        ,char_ '*' *> (("*":) <$> more) -- comment contains * but this isn't the comment end token
         ,(:) <$> takeWhile1P (Just "non comment terminator text") (/= '*') <*> more]
 
+{-
+Try is used in the block comment for the two symbol bits because we
+want to backtrack if we read the first symbol but the second symbol
+isn't there.
+-}
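To see how more assembles a comment, trace the input "/* a*b */": after the opening "/*" it takes " a" (text up to a star), then "*" (a star that isn't part of "*/"), then "b ", then "*/" ends the recursion, and T.concat glues the chunks back together. A sketch of that final concatenation (blockChunksDemo is a hypothetical name):

    blockChunksDemo :: Text
    blockChunksDemo = T.concat ["/*", " a", "*", "b ", "*/"]  -- == "/* a*b */"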
-{-
-blockComment :: Dialect -> Parser Token
-blockComment _ =
-    (\s -> BlockComment $ concat ["/*",s]) <$>
-    (try (string "/*") *> commentSuffix 0)
-  where
-    commentSuffix :: Int -> Parser String
-    commentSuffix n = do
-      -- read until a possible end comment or nested comment
-      x <- takeWhile (\e -> e /= '/' && e /= '*')
-      choice [-- close comment: if the nesting is 0, done
-              -- otherwise recurse on commentSuffix
-              try (string "*/") *> let t = concat [x,"*/"]
-                                   in if n == 0
-                                      then return t
-                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
-             -- nested comment, recurse
-             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
-             -- not an end comment or nested comment, continue
-             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
--}
 {-
 This is to improve user experience: provide an error if we see */
 outside a comment. This could potentially break postgres ops with */
@@ -470,23 +416,14 @@ in them (which is a stupid thing to do). In other cases, the user
 should write * / instead (I can't think of any cases when this would
 be valid syntax though).
 -}
 
-{-
 dontParseEndBlockComment :: Dialect -> Parser Token
 dontParseEndBlockComment _ =
     -- don't use try, then it should commit to the error
     try (string "*/") *> fail "comment end without comment start"
--}
 --------------------------------------
 
-sqlNumber :: Dialect -> Parser Token
-sqlNumber _ =
-    SqlNumber <$> digits
-
-digits :: Parser Text
-digits = takeWhile1P (Just "digit") isDigit
-
 {-
 numbers
@@ -502,9 +439,18 @@ present. There cannot be any spaces or other characters embedded in
 the constant. Note that any leading plus or minus sign is not actually
 considered part of the constant; it is an operator applied to the
 constant.
+
+algorithm:
+either
+  parse 1 or more digits
+  then an optional dot which isn't two dots
+  then optional digits
+ or: parse a dot which isn't two dots
+  then digits
+followed by an optional exponent
 -}
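Worked examples of that grammar, in the style of the test tables later in this commit:

    numberExamples :: [(Text, [Token])]
    numberExamples =
        [("10",     [SqlNumber "10"])      -- digits only
        ,("10.",    [SqlNumber "10."])     -- digits then a trailing dot
        ,(".1",     [SqlNumber ".1"])      -- leading dot then digits
        ,("10.2",   [SqlNumber "10.2"])
        ,("10.2e7", [SqlNumber "10.2e7"])] -- with an exponent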
-{-
 sqlNumber :: Dialect -> Parser Token
 sqlNumber d =
     SqlNumber <$> completeNumber
@@ -517,15 +463,14 @@ sqlNumber d =
     ]
   where
     completeNumber =
-      (int <??> (pp dot <??.> pp int)
+      (digits <??> (pp dot <??.> pp digits)
       -- try is used in case we read a dot
       -- and it isn't part of a number
       -- if there are any following digits, then we commit
       -- to it being a number and not something else
-      <|> try ((++) <$> dot <*> int))
+      <|> try ((<>) <$> dot <*> digits))
       <??> pp expon
 
-    int = many1 digit
     -- make sure we don't parse two adjacent dots in a number
     -- special case for postgresql, we backtrack if we see two adjacent dots
     -- to parse 1..2, but in other dialects we commit to the failure
@@ -533,38 +478,31 @@ sqlNumber d =
        in if diPostgresSymbols d
           then try p
           else p
-    expon = (:) <$> oneOf "eE" <*> sInt
-    sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
-    pp = (<$$> (++))
--}
+    expon = T.cons <$> oneOf "eE" <*> sInt
+    sInt = (<>) <$> option "" (T.singleton <$> oneOf "+-") <*> digits
+    pp = (<$$> (<>))
+
+    p <??> q = p <**> option id q
+    pa <$$> c = pa <**> pure (flip c)
+    pa <??.> pb =
+        let c = (<$>) . flip
+        in (.) `c` pa <*> option id pb
+
+digits :: Parser Text
+digits = takeWhile1P (Just "digit") isDigit
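The operator soup in completeNumber reads better with the types in mind: p <??> q parses p, then optionally parses q as a function to post-process p's result, and pp lifts a text parser into such a function that appends its text. A minimal sketch of the idiom reusing those definitions (fractionDemo is a hypothetical name):

    -- parse digits, then optionally a ".digits" suffix, gluing the two together
    fractionDemo :: Parser Text
    fractionDemo = digits <??> pp frac
      where
        frac = T.cons <$> char '.' <*> digits
        pp = (<$$> (<>))
        p <??> q = p <**> option id q
        pa <$$> c = pa <**> pure (flip c)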
 --------------------------------------
 
-positionalArg :: Dialect -> Parser Token
-positionalArg _ = PositionalArg <$> (char_ '$' *> (read . T.unpack <$> digits))
-
-{-
 positionalArg :: Dialect -> Parser Token
 positionalArg d =
   guard (diPositionalArg d) >>
   -- use try to avoid ambiguities with other syntax which starts with dollar
-  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
--}
+  PositionalArg <$> try (char_ '$' *> (read . T.unpack <$> digits))
 --------------------------------------
 
 -- todo: I think the try here should read a prefix char, then a single valid
 -- identifier char, then commit
 prefixedVariable :: Dialect -> Parser Token
-prefixedVariable d = try $ choice
-    [PrefixedVariable <$> (':' <$ char_ ':') <*> identifierString d
-    ]
-
--- use try because : and @ can be part of other things also
-{-
-prefixedVariable :: Dialect -> Parser Token
 prefixedVariable d = try $ choice
     [PrefixedVariable <$> char ':' <*> identifierString
     ,guard (diAtIdentifier d) >>
@@ -572,21 +510,9 @@ prefixedVariable d = try $ choice
     ,guard (diHashIdentifier d) >>
      PrefixedVariable <$> char '#' <*> identifierString
     ]
--}
 --------------------------------------
 
-symbol :: Dialect -> Parser Token
-symbol _ =
-    Symbol <$> choice
-    [try $ choice $ map (\a -> string a) multiCharSymbols
-    ,T.singleton <$> satisfy (`elem` singleLetterSymbol)
-    ]
-  where
-    singleLetterSymbol = "(),-+*/<>=." :: String
-    multiCharSymbols = ["!=", "<>", ">=", "<=", "||"]
-
 {-
 Symbols
@@ -596,7 +522,7 @@ A symbol is an operator, or one of the misc symbols which include:
 The postgresql operator syntax allows a huge range of operators
 compared with ansi and other dialects
 -}
 
-{-
 symbol :: Dialect -> Parser Token
 symbol d = Symbol <$> choice (concat
     [dots
@@ -610,14 +536,14 @@ symbol d = Symbol <$> choice (concat
      else basicAnsiOps
     ])
   where
-    dots = [many1 (char '.')]
+    dots = [takeWhile1P (Just "dot") (=='.')]
     odbcSymbol = [string "{", string "}"]
     postgresExtraSymbols =
         [try (string ":=")
         -- parse :: and : and avoid allowing ::: or more
         ,try (string "::" <* notFollowedBy (char ':'))
         ,try (string ":" <* notFollowedBy (char ':'))]
-    miscSymbol = map (string . (:[])) $
+    miscSymbol = map (string . T.singleton) $
         case () of
             _ | diSqlServerSymbols d -> ",;():?"
               | diPostgresSymbols d -> "[],;()"
@@ -629,14 +555,14 @@ symbols can also be part of a single character symbol
 -}
     basicAnsiOps = map (try . string) [">=","<=","!=","<>"]
-                   ++ map (string . (:[])) "+-^*/%~&<>="
+                   ++ map (string . T.singleton) "+-^*/%~&<>="
                    ++ pipes
     pipes = -- what about using many1 (char '|'), then it will
             -- fail in the parser? Not sure exactly how
             -- standalone the lexer should be
         [char '|' *>
          choice ["||" <$ char '|' <* notFollowedBy (char '|')
-               ,return "|"]]
+               ,pure "|"]]
 
 {-
 postgresql generalized operators
@@ -662,7 +588,7 @@ which allows the last character of a multi character symbol to be + or
 -
 -}
 
-generalizedPostgresqlOperator :: [Parser String]
+generalizedPostgresqlOperator :: [Parser Text]
 generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
   where
     allOpSymbols = "+-*/<>=~!@#%^&|`?"
@@ -674,13 +600,13 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
     singlePlusMinus = try $ do
       c <- oneOf "+-"
       notFollowedBy $ oneOf allOpSymbols
-      return [c]
+      pure $ T.singleton c
     -- this is used when we are parsing a potentially multi symbol
     -- operator and we have alread seen one of the 'exception chars'
     -- and so we can end with a + or -
     moreOpCharsException = do
-        c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
+        c <- oneOf (filter (`notElemChar` "-/*") allOpSymbols)
             -- make sure we don't parse a comment starting token
             -- as part of an operator
             <|> try (char '/' <* notFollowedBy (char '*'))
@@ -688,14 +614,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
             -- and make sure we don't parse a block comment end
             -- as part of another symbol
             <|> try (char '*' <* notFollowedBy (char '/'))
-        (c:) <$> option [] moreOpCharsException
+        T.cons c <$> option "" moreOpCharsException
     opMoreChars = choice
         [-- parse an exception char, now we can finish with a + -
-         (:)
+         T.cons
         <$> oneOf exceptionOpSymbols
-        <*> option [] moreOpCharsException
-        ,(:)
+        <*> option "" moreOpCharsException
+        ,T.cons
         <$> (-- parse +, make sure it isn't the last symbol
             try (char '+' <* lookAhead (oneOf allOpSymbols))
             <|> -- parse -, make sure it isn't the last symbol
@@ -709,18 +635,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
             try (char '*' <* notFollowedBy (char '/'))
             <|> -- any other ansi operator symbol
             oneOf "<>=")
-        <*> option [] opMoreChars
+        <*> option "" opMoreChars
         ]
--}
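The +/- rule in practice, with Symbol values shaped as in the test tables (a sketch; the expected splits follow from the rules above):

    opEdgeCases :: [(Text, [Token])]
    opEdgeCases =
        [("@+",  [Symbol "@+"])              -- @ is an exception char, so + may end the operator
        ,("<=+", [Symbol "<=", Symbol "+"])] -- no exception char in <=, so + lexes separately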
 --------------------------------------
 
 sqlWhitespace :: Dialect -> Parser Token
 sqlWhitespace _ = Whitespace <$> takeWhile1P (Just "whitespace") isSpace <?> ""
 
---sqlWhitespace :: Dialect -> Parser Token
---sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
 ----------------------------------------------------------------------------
 
 -- parser helpers
 
@@ -731,25 +653,11 @@ char_ = void . char
 string_ :: Text -> Parser ()
 string_ = void . string
 
-{-
-startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
-startsWith p ps = do
-  c <- satisfy p
-  choice [(:) c <$> (takeWhile1 ps)
-         ,return [c]]
-
-takeWhile1 :: (Char -> Bool) -> Parser String
-takeWhile1 p = many1 (satisfy p)
-
-takeWhile :: (Char -> Bool) -> Parser String
-takeWhile p = many (satisfy p)
-
-takeTill :: (Char -> Bool) -> Parser String
-takeTill p = manyTill anyChar (peekSatisfy p)
-
-peekSatisfy :: (Char -> Bool) -> Parser ()
-peekSatisfy p = void $ lookAhead (satisfy p)
--}
+oneOf :: [Char] -> Parser Char
+oneOf = M.oneOf
+
+notElemChar :: Char -> [Char] -> Bool
+notElemChar a b = a `notElem` (b :: [Char])
 ----------------------------------------------------------------------------
 
@@ -776,8 +684,7 @@ successes. I don't think it succeeds this test at the moment
 -- will pretty print then lex back to the same set of tokens.
 -- Used internally, might be useful for generating SQL via lexical tokens.
 tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
-tokenListWillPrintAndLex = undefined
-{-tokenListWillPrintAndLex _ [] = True
+tokenListWillPrintAndLex _ [] = True
 tokenListWillPrintAndLex _ [_] = True
 tokenListWillPrintAndLex d (a:b:xs) =
     tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)
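The property being restored here: a token sequence is only used for the generated round-trip tests when pretty printing it and relexing yields the same sequence. A sketch of a pair it should reject, assuming the ansi2011 dialect value:

    -- "<" then "=" prints as "<=", which relexes as a single token
    rejectedPair :: Bool
    rejectedPair = tokenListWillPrintAndLex ansi2011 [Symbol "<", Symbol "="]  -- expected False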
@@ -791,7 +698,7 @@ followed by = or : makes a different symbol
 -}
 
     | Symbol ":" <- a
-    , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False
+    , checkFirstBChar (\x -> isIdentifierChar x || x `T.elem` ":=") = False
 
 {-
 two symbols next to eachother will fail if the symbols can combine and
@@ -801,7 +708,7 @@ two symbols next to eachother will fail if the symbols can combine and
     | diPostgresSymbols d
     , Symbol a' <- a
     , Symbol b' <- b
-    , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False
+    , b' `notElem` ["+", "-"] || or (map (`T.elem` a') "~!@#%^&|`?") = False
@@ -906,17 +813,13 @@ TODO: not 100% on this always being bad
     -- helper function to run a predicate on the
     -- last character of the first token and the first
     -- character of the second token
-    checkBorderChars f
-        | (_:_) <- prettya
-        , (fb:_) <- prettyb
-        , la <- last prettya
-        = f la fb
-    checkBorderChars _ = False
-    checkFirstBChar f = case prettyb of
-        (b':_) -> f b'
-        _ -> False
-    checkLastAChar f = case prettya of
-        (_:_) -> f $ last prettya
-        _ -> False
--}
+    checkBorderChars f =
+        case (T.unsnoc prettya, T.uncons prettyb) of
+            (Just (_,la), Just (fb,_)) -> f la fb
+            _ -> False
+    checkFirstBChar f = case T.uncons prettyb of
+        Just (b',_) -> f b'
+        _ -> False
+    checkLastAChar f = case T.unsnoc prettya of
+        Just (_,la) -> f la
+        _ -> False
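The Text rewrite replaces the partial head/last idioms with total functions: T.uncons splits off the first character, T.unsnoc the last, and both return Nothing on empty text, so the fall-through cases become explicit. A small illustration (borderDemo is a hypothetical name):

    borderDemo :: Maybe (Char, Char)
    borderDemo = do
        (_, la) <- T.unsnoc "ab"  -- last char of the first token's text: 'b'
        (fb, _) <- T.uncons "cd"  -- first char of the second token's text: 'c'
        pure (la, fb)             -- Just ('b','c')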


@@ -24,10 +24,8 @@ import Language.SQL.SimpleSQL.Lex
     ,tokenListWillPrintAndLex
     )
 
-import Language.SQL.SimpleSQL.Dialect
-    (ansi2011)
-
 import qualified Data.Text as T
+import Data.Text (Text)
 
 --import Debug.Trace
 --import Data.Char (isAlpha)
@@ -35,12 +33,13 @@ import qualified Data.Text as T
 
 lexerTests :: TestItem
 lexerTests = Group "lexerTests" $
-    [bootstrapTests{-Group "lexer token tests" [ansiLexerTests
+    [bootstrapTests
+    ,ansiLexerTests
     ,postgresLexerTests
     ,sqlServerLexerTests
     ,oracleLexerTests
     ,mySqlLexerTests
-    ,odbcLexerTests]-}]
+    ,odbcLexerTests]
 
 -- quick sanity tests to see something working
 bootstrapTests :: TestItem
@@ -75,8 +74,9 @@ bootstrapTests = Group "bootstrap tests" $
     ,("1", [SqlNumber "1"])
     ,("42", [SqlNumber "42"])
 
-    ,("$1", [PositionalArg 1])
-    ,("$200", [PositionalArg 200])
+    -- have to fix the dialect handling in the tests
+    --,("$1", [PositionalArg 1])
+    --,("$200", [PositionalArg 200])
 
     ,(":test", [PrefixedVariable ':' "test"])
@@ -84,22 +84,22 @@ bootstrapTests = Group "bootstrap tests" $
     ["!=", "<>", ">=", "<=", "||"]
     ++ map T.singleton ("(),-+*/<>=." :: String)))
 
-{-
-ansiLexerTable :: [(String,[Token])]
+ansiLexerTable :: [(Text,[Token])]
 ansiLexerTable =
     -- single char symbols
-    map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;()"
+    map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;()"
     -- multi char symbols
     ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||"]
     ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
         -- simple identifiers
         in map (\i -> (i, [Identifier Nothing i])) idens
-           ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens
+           <> map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens
           -- todo: in order to make lex . pretty id, need to
           -- preserve the case of the u
-           ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens
+           <> map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens
          -- host param
-           ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens
+           <> map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens
        )
     -- quoted identifiers with embedded double quotes
     -- the lexer doesn't unescape the quotes
@@ -111,7 +111,7 @@ ansiLexerTable =
     ,("'normalendquote '''", [SqlString "'" "'" "normalendquote ''"])
     ,("'\n'", [SqlString "'" "'" "\n"])]
     -- csstrings
-    ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"]))
+    ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"]))
            ["n", "N","b", "B","x", "X", "u&"]
     -- numbers
     ++ [("10", [SqlNumber "10"])
@@ -122,8 +122,8 @@ ansiLexerTable =
     ,("10.2", [SqlNumber "10.2"])
     ,("10.2e7", [SqlNumber "10.2e7"])]
     -- whitespace
-    ++ concat [[([a],[Whitespace [a]])
-               ,([a,b], [Whitespace [a,b]])]
+    ++ concat [[(T.singleton a,[Whitespace $ T.singleton a])
+               ,(T.singleton a <> T.singleton b, [Whitespace (T.singleton a <> T.singleton b)])]
              | a <- " \n\t", b <- " \n\t"]
     -- line comment
     ++ map (\c -> (c, [LineComment c]))
@@ -134,14 +134,15 @@ ansiLexerTable =
     ,"/* this *is/ a comment */"
     ]
 
 ansiLexerTests :: TestItem
 ansiLexerTests = Group "ansiLexerTests" $
     [Group "ansi lexer token tests" $ [LexTest ansi2011 s t | (s,t) <- ansiLexerTable]
     ,Group "ansi generated combination lexer tests" $
-    [ LexTest ansi2011 (s ++ s1) (t ++ t1)
+    [ LexTest ansi2011 (s <> s1) (t <> t1)
     | (s,t) <- ansiLexerTable
     , (s1,t1) <- ansiLexerTable
-    , tokenListWillPrintAndLex ansi2011 $ t ++ t1
+    , tokenListWillPrintAndLex ansi2011 $ t <> t1
     ]
     ,Group "ansiadhoclexertests" $
@@ -185,10 +186,10 @@ assurance.
 
-postgresLexerTable :: [(String,[Token])]
+postgresLexerTable :: [(Text,[Token])]
 postgresLexerTable =
     -- single char symbols
-    map (\s -> ([s],[Symbol [s]])) "+-^*/%~&|?<>[]=,;():"
+    map (\s -> (T.singleton s,[Symbol $ T.singleton s])) "+-^*/%~&|?<>[]=,;():"
     -- multi char symbols
     ++ map (\s -> (s,[Symbol s])) [">=","<=","!=","<>","||", "::","..",":="]
     -- generic symbols
@@ -196,12 +197,12 @@ postgresLexerTable =
     ++ (let idens = ["a", "_a", "test", "table", "Stuff", "STUFF"]
         -- simple identifiers
         in map (\i -> (i, [Identifier Nothing i])) idens
-           ++ map (\i -> ("\"" ++ i ++ "\"", [Identifier (Just ("\"","\"")) i])) idens
+           ++ map (\i -> ("\"" <> i <> "\"", [Identifier (Just ("\"","\"")) i])) idens
           -- todo: in order to make lex . pretty id, need to
           -- preserve the case of the u
-           ++ map (\i -> ("u&\"" ++ i ++ "\"", [Identifier (Just ("u&\"","\"")) i])) idens
+           ++ map (\i -> ("u&\"" <> i <> "\"", [Identifier (Just ("u&\"","\"")) i])) idens
          -- host param
-           ++ map (\i -> (':':i, [PrefixedVariable ':' i])) idens
+           ++ map (\i -> (T.cons ':' i, [PrefixedVariable ':' i])) idens
        )
     -- positional var
     ++ [("$1", [PositionalArg 1])]
@@ -223,7 +224,7 @@ postgresLexerTable =
     ,("$a$ $$string 3$$ $a$", [SqlString "$a$" "$a$" " $$string 3$$ "])
     ]
     -- csstrings
-    ++ map (\c -> (c ++ "'test'", [SqlString (c ++ "'") "'" "test"]))
+    ++ map (\c -> (c <> "'test'", [SqlString (c <> "'") "'" "test"]))
            ["n", "N","b", "B","x", "X", "u&", "e", "E"]
     -- numbers
     ++ [("10", [SqlNumber "10"])
@@ -234,8 +235,8 @@ postgresLexerTable =
     ,("10.2", [SqlNumber "10.2"])
     ,("10.2e7", [SqlNumber "10.2e7"])]
     -- whitespace
-    ++ concat [[([a],[Whitespace [a]])
-               ,([a,b], [Whitespace [a,b]])]
+    ++ concat [[(T.singleton a,[Whitespace $ T.singleton a])
+               ,(T.singleton a <> T.singleton b, [Whitespace $ T.singleton a <> T.singleton b])]
              | a <- " \n\t", b <- " \n\t"]
     -- line comment
     ++ map (\c -> (c, [LineComment c]))
@@ -267,24 +268,24 @@ operators without one of the exception chars
 also: do the testing for the ansi compatibility special cases
 -}
 
-postgresShortOperatorTable :: [(String,[Token])]
+postgresShortOperatorTable :: [(Text,[Token])]
 postgresShortOperatorTable =
     [ (x, [Symbol x]) | x <- someValidPostgresOperators 2]
 
-postgresExtraOperatorTable :: [(String,[Token])]
+postgresExtraOperatorTable :: [(Text,[Token])]
 postgresExtraOperatorTable =
     [ (x, [Symbol x]) | x <- someValidPostgresOperators 4]
 
-someValidPostgresOperators :: Int -> [String]
+someValidPostgresOperators :: Int -> [Text]
 someValidPostgresOperators l =
     [ x
     | n <- [1..l]
     , x <- combos "+-*/<>=~!@#%^&|`?" n
-    , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
-    , not (last x `elem` "+-")
-      || or (map (`elem` x) "~!@#%^&|`?")
+    , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x)
+    , not (T.last x `T.elem` "+-")
+      || or (map (`T.elem` x) "~!@#%^&|`?")
     ]
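For intuition about what the generator admits (opsDemo is a hypothetical name; expected values follow from the filters above):

    opsDemo :: ([Bool], [Bool])
    opsDemo =
        ( map (`elem` ops) ["@@", "!=", "~+"]  -- expected all True; "~+" ends in +
                                               -- but contains the exception char ~
        , map (`elem` ops) ["--", "/*", "*+"]) -- expected all False
      where ops = someValidPostgresOperators 2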
 {-
@@ -293,13 +294,13 @@ These are postgres operators, which if followed immediately by a + or
 the + or -.
 -}
 
-somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [String]
+somePostgresOpsWhichWontAddTrailingPlusMinus :: Int -> [Text]
 somePostgresOpsWhichWontAddTrailingPlusMinus l =
     [ x
     | n <- [1..l]
     , x <- combos "+-*/<>=" n
-    , not ("--" `isInfixOf` x || "/*" `isInfixOf` x || "*/" `isInfixOf` x)
-    , not (last x `elem` "+-")
+    , not ("--" `T.isInfixOf` x || "/*" `T.isInfixOf` x || "*/" `T.isInfixOf` x)
+    , not (T.last x `T.elem` "+-")
     ]
@@ -310,7 +311,7 @@ postgresLexerTests = Group "postgresLexerTests" $
     ,Group "postgres generated lexer token tests" $
     [LexTest postgres s t | (s,t) <- postgresShortOperatorTable ++ postgresExtraOperatorTable]
     ,Group "postgres generated combination lexer tests" $
-    [ LexTest postgres (s ++ s1) (t ++ t1)
+    [ LexTest postgres (s <> s1) (t <> t1)
     | (s,t) <- postgresLexerTable ++ postgresShortOperatorTable
     , (s1,t1) <- postgresLexerTable ++ postgresShortOperatorTable
     , tokenListWillPrintAndLex postgres $ t ++ t1
@@ -344,18 +345,18 @@ postgresLexerTests = Group "postgresLexerTests" $
     ]
   where
     edgeCaseCommentOps =
-        [ (x ++ "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
+        [ (x <> "/*<test*/", [Symbol x, BlockComment "/*<test*/"])
         | x <- eccops
-        , not (last x == '*')
+        , not (T.last x == '*')
         ] ++
-        [ (x ++ "--<test", [Symbol x, LineComment "--<test"])
+        [ (x <> "--<test", [Symbol x, LineComment "--<test"])
         | x <- eccops
-        , not (last x == '-')
+        , not (T.last x == '-')
         ]
     eccops = someValidPostgresOperators 2
     edgeCasePlusMinusOps = concat
-        [ [ (x ++ "+", [Symbol x, Symbol "+"])
-          , (x ++ "-", [Symbol x, Symbol "-"]) ]
+        [ [ (x <> "+", [Symbol x, Symbol "+"])
+          , (x <> "-", [Symbol x, Symbol "-"]) ]
        | x <- somePostgresOpsWhichWontAddTrailingPlusMinus 2
        ]
     edgeCasePlusMinusComments =
@@ -365,7 +366,6 @@ postgresLexerTests = Group "postgresLexerTests" $
     ,("+/**/", [Symbol "+", BlockComment "/**/"])
     ]
 sqlServerLexerTests :: TestItem
 sqlServerLexerTests = Group "sqlServerLexTests" $
     [ LexTest sqlserver s t | (s,t) <-
@@ -393,8 +393,6 @@ odbcLexerTests = Group "odbcLexTests" $
     ++ [LexFails sqlserver {diOdbc = False} "{"
        ,LexFails sqlserver {diOdbc = False} "}"]
 
-combos :: [a] -> Int -> [[a]]
-combos _ 0 = [[]]
-combos l n = [ x:tl | x <- l, tl <- combos l (n - 1) ]
--}
+combos :: [Char] -> Int -> [Text]
+combos _ 0 = [T.empty]
+combos l n = [ T.cons x tl | x <- l, tl <- combos l (n - 1) ]
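combos l n enumerates every string over the alphabet l of length exactly n (callers like someValidPostgresOperators then iterate n from 1 up). A quick illustration (combosDemo is a hypothetical name):

    combosDemo :: [Text]
    combosDemo = combos "ab" 2  -- ["aa","ab","ba","bb"]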


@@ -36,7 +36,6 @@ module Language.SQL.SimpleSQL.SQL2011Queries (sql2011QueryTests) where
 import Language.SQL.SimpleSQL.TestTypes
 import Language.SQL.SimpleSQL.Syntax
-import qualified Data.Text as T
 import Data.Text (Text)
 
 sql2011QueryTests :: TestItem


@@ -7,8 +7,6 @@ module Language.SQL.SimpleSQL.ScalarExprs (scalarExprTests) where
 import Language.SQL.SimpleSQL.TestTypes
 import Language.SQL.SimpleSQL.Syntax
-import qualified Data.Text as T
 
 scalarExprTests :: TestItem
 scalarExprTests = Group "scalarExprTests"
     [literals


@@ -58,7 +58,7 @@ testData :: TestItem
 testData =
     Group "parserTest"
     [lexerTests
-    {-,scalarExprTests
+    ,scalarExprTests
     ,odbcTests
     ,queryExprComponentTests
     ,queryExprsTests
@@ -76,7 +76,7 @@ testData =
     ,oracleTests
     ,customDialectTests
     ,emptyStatementTests
-    ,createIndexTests-}
+    ,createIndexTests
     ]
 
 tests :: T.TestTree