1
Fork 0

reorder the lex file in prep

This commit is contained in:
Jake Wheat 2024-01-09 17:47:04 +00:00
parent 8c05ffd0b6
commit d80796b1dd

View file

@ -16,6 +16,42 @@ don't do nearly as comprehensive testing on the syntax level, we still
have a relatively high assurance of the low level of bugs. This is have a relatively high assurance of the low level of bugs. This is
much more difficult to get parity with when testing the syntax parser much more difficult to get parity with when testing the syntax parser
directly without the separately testing lexing stage. directly without the separately testing lexing stage.
TODO:
make the tokenswill print more dialect accurate. Maybe add symbol
chars and identifier chars to the dialect definition and use them from
here
start adding negative / different parse dialect tests
add token tables and tests for oracle, sql server
review existing tables
look for refactoring opportunities, especially the token
generation tables in the tests
do some user documentation on lexing, and lexing/dialects
start thinking about a more separated design for the dialect handling
lexing tests are starting to take a really long time, so split the
tests so it is much easier to run all the tests except the lexing
tests which only need to be run when working on the lexer (which
should be relatively uncommon), or doing a commit or finishing off a
series of commits,
start writing the error message tests:
generate/write a large number of syntax errors
create a table with the source and the error message
try to compare some different versions of code to compare the
quality of the error messages by hand
get this checked in so improvements and regressions in the error
message quality can be tracked a little more easily (although it will
still be manual)
try again to add annotation to the ast
-} -}
-- | Lexer for SQL. -- | Lexer for SQL.
@ -49,6 +85,9 @@ import Prelude hiding (takeWhile)
import Text.Parsec.String (Parser) import Text.Parsec.String (Parser)
import Data.Maybe import Data.Maybe
------------------------------------------------------------------------------
-- syntax
-- | Represents a lexed token -- | Represents a lexed token
data Token data Token
@ -58,63 +97,37 @@ data Token
-- * single char symbols: * + - < > ^ / % ~ & | ? ( ) [ ] , ; ( ) -- * single char symbols: * + - < > ^ / % ~ & | ? ( ) [ ] , ; ( )
-- --
= Symbol String = Symbol String
-- | This is an identifier or keyword. The first field is -- | This is an identifier or keyword. The first field is
-- the quotes used, or nothing if no quotes were used. The quotes -- the quotes used, or nothing if no quotes were used. The quotes
-- can be " or u& or something dialect specific like [] -- can be " or u& or something dialect specific like []
| Identifier (Maybe (String,String)) String | Identifier (Maybe (String,String)) String
-- | This is a prefixed variable symbol, such as :var, @var or #var -- | This is a prefixed variable symbol, such as :var, @var or #var
-- (only :var is used in ansi dialect) -- (only :var is used in ansi dialect)
| PrefixedVariable Char String | PrefixedVariable Char String
-- | This is a positional arg identifier e.g. $1 -- | This is a positional arg identifier e.g. $1
| PositionalArg Int | PositionalArg Int
-- | This is a string literal. The first two fields are the -- -- | This is a string literal. The first two fields are the --
-- start and end quotes, which are usually both ', but can be -- start and end quotes, which are usually both ', but can be
-- the character set (one of nNbBxX, or u&, U&), or a dialect -- the character set (one of nNbBxX, or u&, U&), or a dialect
-- specific string quoting (such as $$ in postgres) -- specific string quoting (such as $$ in postgres)
| SqlString String String String | SqlString String String String
-- | A number literal (integral or otherwise), stored in original format -- | A number literal (integral or otherwise), stored in original format
-- unchanged -- unchanged
| SqlNumber String | SqlNumber String
-- | Whitespace, one or more of space, tab or newline. -- | Whitespace, one or more of space, tab or newline.
| Whitespace String | Whitespace String
-- | A commented line using --, contains every character starting with the -- | A commented line using --, contains every character starting with the
-- \'--\' and including the terminating newline character if there is one -- \'--\' and including the terminating newline character if there is one
-- - this will be missing if the last line in the source is a line comment -- - this will be missing if the last line in the source is a line comment
-- with no trailing newline -- with no trailing newline
| LineComment String | LineComment String
-- | A block comment, \/* stuff *\/, includes the comment delimiters -- | A block comment, \/* stuff *\/, includes the comment delimiters
| BlockComment String | BlockComment String
deriving (Eq,Show) deriving (Eq,Show)
------------------------------------------------------------------------------
-- main api functions
-- | Pretty printing, if you lex a bunch of tokens, then pretty
-- print them, should should get back exactly the same string
prettyToken :: Dialect -> Token -> String
prettyToken _ (Symbol s) = s
prettyToken _ (Identifier Nothing t) = t
prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
prettyToken _ (PrefixedVariable c p) = c:p
prettyToken _ (PositionalArg p) = '$':show p
prettyToken _ (SqlString s e t) = s ++ t ++ e
prettyToken _ (SqlNumber r) = r
prettyToken _ (Whitespace t) = t
prettyToken _ (LineComment l) = l
prettyToken _ (BlockComment c) = c
prettyTokens :: Dialect -> [Token] -> String
prettyTokens d ts = concat $ map (prettyToken d) ts
-- TODO: try to make all parsers applicative only
-- | Lex some SQL to a list of tokens. -- | Lex some SQL to a list of tokens.
lexSQL :: Dialect lexSQL :: Dialect
@ -138,6 +151,31 @@ lexSQL dialect fn' p src =
. flip setSourceColumn c) getPosition . flip setSourceColumn c) getPosition
>>= setPosition >>= setPosition
------------------------------------------------------------------------------
-- pretty printing
-- | Pretty printing, if you lex a bunch of tokens, then pretty
-- print them, should should get back exactly the same string
prettyToken :: Dialect -> Token -> String
prettyToken _ (Symbol s) = s
prettyToken _ (Identifier Nothing t) = t
prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
prettyToken _ (PrefixedVariable c p) = c:p
prettyToken _ (PositionalArg p) = '$':show p
prettyToken _ (SqlString s e t) = s ++ t ++ e
prettyToken _ (SqlNumber r) = r
prettyToken _ (Whitespace t) = t
prettyToken _ (LineComment l) = l
prettyToken _ (BlockComment c) = c
prettyTokens :: Dialect -> [Token] -> String
prettyTokens d ts = concat $ map (prettyToken d) ts
------------------------------------------------------------------------------
-- token parsers
-- | parser for a sql token -- | parser for a sql token
sqlToken :: Dialect -> Parser ((String,Int,Int),Token) sqlToken :: Dialect -> Parser ((String,Int,Int),Token)
sqlToken d = do sqlToken d = do
@ -164,77 +202,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
,symbol d ,symbol d
,sqlWhitespace d] ,sqlWhitespace d]
{- --------------------------------------
Parses identifiers:
simple_identifier_23
u&"unicode quoted identifier"
"quoted identifier"
"quoted identifier "" with double quote char"
`mysql quoted identifier`
-}
identifier :: Dialect -> Parser Token
identifier d =
choice
[quotedIden
,unicodeQuotedIden
,regularIden
,guard (diBackquotedIden d) >> mySqlQuotedIden
,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
]
where
regularIden = Identifier Nothing <$> identifierString
quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
mySqlQuotedIden = Identifier (Just ("`","`"))
<$> (char '`' *> takeWhile1 (/='`') <* char '`')
sqlServerQuotedIden = Identifier (Just ("[","]"))
<$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
-- try is used here to avoid a conflict with identifiers
-- and quoted strings which also start with a 'u'
unicodeQuotedIden = Identifier
<$> (f <$> try (oneOf "uU" <* string "&"))
<*> qidenPart
where f x = Just (x: "&\"", "\"")
qidenPart = char '"' *> qidenSuffix ""
qidenSuffix t = do
s <- takeTill (=='"')
void $ char '"'
-- deal with "" as literal double quote character
choice [do
void $ char '"'
qidenSuffix $ concat [t,s,"\"\""]
,return $ concat [t,s]]
-- This parses a valid identifier without quotes.
identifierString :: Parser String
identifierString =
startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
-- this can be moved to the dialect at some point
isIdentifierChar :: Char -> Bool
isIdentifierChar c = c == '_' || isAlphaNum c
-- use try because : and @ can be part of other things also
prefixedVariable :: Dialect -> Parser Token
prefixedVariable d = try $ choice
[PrefixedVariable <$> char ':' <*> identifierString
,guard (diAtIdentifier d) >>
PrefixedVariable <$> char '@' <*> identifierString
,guard (diHashIdentifier d) >>
PrefixedVariable <$> char '#' <*> identifierString
]
positionalArg :: Dialect -> Parser Token
positionalArg d =
guard (diPositionalArg d) >>
-- use try to avoid ambiguities with other syntax which starts with dollar
PositionalArg <$> try (char '$' *> (read <$> many1 digit))
{- {-
Parse a SQL string. Examples: Parse a SQL string. Examples:
@ -292,6 +260,125 @@ sqlString d = dollarString <|> csString <|> normalString
++ [string "u&'" ++ [string "u&'"
,string "U&'"] ,string "U&'"]
--------------------------------------
{-
Parses identifiers:
simple_identifier_23
u&"unicode quoted identifier"
"quoted identifier"
"quoted identifier "" with double quote char"
`mysql quoted identifier`
-}
identifier :: Dialect -> Parser Token
identifier d =
choice
[quotedIden
,unicodeQuotedIden
,regularIden
,guard (diBackquotedIden d) >> mySqlQuotedIden
,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
]
where
regularIden = Identifier Nothing <$> identifierString
quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
mySqlQuotedIden = Identifier (Just ("`","`"))
<$> (char '`' *> takeWhile1 (/='`') <* char '`')
sqlServerQuotedIden = Identifier (Just ("[","]"))
<$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
-- try is used here to avoid a conflict with identifiers
-- and quoted strings which also start with a 'u'
unicodeQuotedIden = Identifier
<$> (f <$> try (oneOf "uU" <* string "&"))
<*> qidenPart
where f x = Just (x: "&\"", "\"")
qidenPart = char '"' *> qidenSuffix ""
qidenSuffix t = do
s <- takeTill (=='"')
void $ char '"'
-- deal with "" as literal double quote character
choice [do
void $ char '"'
qidenSuffix $ concat [t,s,"\"\""]
,return $ concat [t,s]]
-- This parses a valid identifier without quotes.
identifierString :: Parser String
identifierString =
startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
-- this can be moved to the dialect at some point
isIdentifierChar :: Char -> Bool
isIdentifierChar c = c == '_' || isAlphaNum c
--------------------------------------
lineComment :: Dialect -> Parser Token
lineComment _ =
(\s -> LineComment $ concat ["--",s]) <$>
-- try is used here in case we see a - symbol
-- once we read two -- then we commit to the comment token
(try (string "--") *> (
-- todo: there must be a better way to do this
conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
where
conc a Nothing = a
conc a (Just b) = a ++ b
lineCommentEnd =
Just "\n" <$ char '\n'
<|> Nothing <$ eof
--------------------------------------
{-
Try is used in the block comment for the two symbol bits because we
want to backtrack if we read the first symbol but the second symbol
isn't there.
-}
blockComment :: Dialect -> Parser Token
blockComment _ =
(\s -> BlockComment $ concat ["/*",s]) <$>
(try (string "/*") *> commentSuffix 0)
where
commentSuffix :: Int -> Parser String
commentSuffix n = do
-- read until a possible end comment or nested comment
x <- takeWhile (\e -> e /= '/' && e /= '*')
choice [-- close comment: if the nesting is 0, done
-- otherwise recurse on commentSuffix
try (string "*/") *> let t = concat [x,"*/"]
in if n == 0
then return t
else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
-- nested comment, recurse
,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
-- not an end comment or nested comment, continue
,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
{-
This is to improve user experience: provide an error if we see */
outside a comment. This could potentially break postgres ops with */
in them (which is a stupid thing to do). In other cases, the user
should write * / instead (I can't think of any cases when this would
be valid syntax though).
-}
dontParseEndBlockComment :: Dialect -> Parser Token
dontParseEndBlockComment _ =
-- don't use try, then it should commit to the error
try (string "*/") *> fail "comment end without comment start"
--------------------------------------
{- {-
numbers numbers
@ -341,6 +428,30 @@ sqlNumber d =
sInt = (++) <$> option "" (string "+" <|> string "-") <*> int sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
pp = (<$$> (++)) pp = (<$$> (++))
--------------------------------------
positionalArg :: Dialect -> Parser Token
positionalArg d =
guard (diPositionalArg d) >>
-- use try to avoid ambiguities with other syntax which starts with dollar
PositionalArg <$> try (char '$' *> (read <$> many1 digit))
--------------------------------------
-- use try because : and @ can be part of other things also
prefixedVariable :: Dialect -> Parser Token
prefixedVariable d = try $ choice
[PrefixedVariable <$> char ':' <*> identifierString
,guard (diAtIdentifier d) >>
PrefixedVariable <$> char '@' <*> identifierString
,guard (diHashIdentifier d) >>
PrefixedVariable <$> char '#' <*> identifierString
]
--------------------------------------
{- {-
Symbols Symbols
@ -466,66 +577,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
<*> option [] opMoreChars <*> option [] opMoreChars
] ]
--------------------------------------
sqlWhitespace :: Dialect -> Parser Token sqlWhitespace :: Dialect -> Parser Token
sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace) sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
lineComment :: Dialect -> Parser Token ----------------------------------------------------------------------------
lineComment _ =
(\s -> LineComment $ concat ["--",s]) <$>
-- try is used here in case we see a - symbol
-- once we read two -- then we commit to the comment token
(try (string "--") *> (
-- todo: there must be a better way to do this
conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
where
conc a Nothing = a
conc a (Just b) = a ++ b
lineCommentEnd =
Just "\n" <$ char '\n'
<|> Nothing <$ eof
{- -- parser helpers
Try is used in the block comment for the two symbol bits because we
want to backtrack if we read the first symbol but the second symbol
isn't there.
-}
blockComment :: Dialect -> Parser Token
blockComment _ =
(\s -> BlockComment $ concat ["/*",s]) <$>
(try (string "/*") *> commentSuffix 0)
where
commentSuffix :: Int -> Parser String
commentSuffix n = do
-- read until a possible end comment or nested comment
x <- takeWhile (\e -> e /= '/' && e /= '*')
choice [-- close comment: if the nesting is 0, done
-- otherwise recurse on commentSuffix
try (string "*/") *> let t = concat [x,"*/"]
in if n == 0
then return t
else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
-- nested comment, recurse
,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
-- not an end comment or nested comment, continue
,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
{-
This is to improve user experience: provide an error if we see */
outside a comment. This could potentially break postgres ops with */
in them (which is a stupid thing to do). In other cases, the user
should write * / instead (I can't think of any cases when this would
be valid syntax though).
-}
dontParseEndBlockComment :: Dialect -> Parser Token
dontParseEndBlockComment _ =
-- don't use try, then it should commit to the error
try (string "*/") *> fail "comment end without comment start"
-- Some helper combinators
startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
startsWith p ps = do startsWith p ps = do
@ -545,6 +604,9 @@ takeTill p = manyTill anyChar (peekSatisfy p)
peekSatisfy :: (Char -> Bool) -> Parser () peekSatisfy :: (Char -> Bool) -> Parser ()
peekSatisfy p = void $ lookAhead (satisfy p) peekSatisfy p = void $ lookAhead (satisfy p)
----------------------------------------------------------------------------
{- {-
This utility function will accurately report if the two tokens are This utility function will accurately report if the two tokens are
pretty printed, if they should lex back to the same two tokens. This pretty printed, if they should lex back to the same two tokens. This
@ -708,44 +770,3 @@ TODO: not 100% on this always being bad
checkLastAChar f = case prettya of checkLastAChar f = case prettya of
(_:_) -> f $ last prettya (_:_) -> f $ last prettya
_ -> False _ -> False
{-
TODO:
make the tokenswill print more dialect accurate. Maybe add symbol
chars and identifier chars to the dialect definition and use them from
here
start adding negative / different parse dialect tests
add token tables and tests for oracle, sql server
review existing tables
look for refactoring opportunities, especially the token
generation tables in the tests
do some user documentation on lexing, and lexing/dialects
start thinking about a more separated design for the dialect handling
lexing tests are starting to take a really long time, so split the
tests so it is much easier to run all the tests except the lexing
tests which only need to be run when working on the lexer (which
should be relatively uncommon), or doing a commit or finishing off a
series of commits,
start writing the error message tests:
generate/write a large number of syntax errors
create a table with the source and the error message
try to compare some different versions of code to compare the
quality of the error messages by hand
get this checked in so improvements and regressions in the error
message quality can be tracked a little more easily (although it will
still be manual)
try again to add annotation to the ast
-}