reorder the lex file in prep

2024-01-09 17:47:04 +00:00 · 2024-01-09 17:47:04 +00:00 · d80796b1dd
commit d80796b1dd
parent 8c05ffd0b6
1 changed files with 217 additions and 196 deletions
--- a/Language/SQL/SimpleSQL/Lex.hs
+++ b/Language/SQL/SimpleSQL/Lex.hs
@ -16,6 +16,42 @@ don't do nearly as comprehensive testing on the syntax level, we still
 have a relatively high assurance of the low level of bugs. This is
 much more difficult to get parity with when testing the syntax parser
 directly without the separately testing lexing stage.
 TODO:
 make the tokenswill print more dialect accurate. Maybe add symbol
  chars and identifier chars to the dialect definition and use them from
  here
 start adding negative / different parse dialect tests
 add token tables and tests for oracle, sql server
 review existing tables
 look for refactoring opportunities, especially the token
 generation tables in the tests
 do some user documentation on lexing, and lexing/dialects
 start thinking about a more separated design for the dialect handling
 lexing tests are starting to take a really long time, so split the
 tests so it is much easier to run all the tests except the lexing
 tests which only need to be run when working on the lexer (which
 should be relatively uncommon), or doing a commit or finishing off a
 series of commits,
 start writing the error message tests:
  generate/write a large number of syntax errors
  create a table with the source and the error message
  try to compare some different versions of code to compare the
    quality of the error messages by hand
  get this checked in so improvements and regressions in the error
    message quality can be tracked a little more easily (although it will
    still be manual)
 try again to add annotation to the ast
 -}
 -- | Lexer for SQL.
@ -49,6 +85,9 @@ import Prelude hiding (takeWhile)
 import Text.Parsec.String (Parser)
 import Data.Maybe
 ------------------------------------------------------------------------------
 -- syntax
 -- | Represents a lexed token
 data Token
@ -58,63 +97,37 @@ data Token
    -- * single char symbols: * + -  < >  ^ / %  ~ & | ? ( ) [ ] , ; ( )
    --
    = Symbol String
    -- | This is an identifier or keyword. The first field is
    -- the quotes used, or nothing if no quotes were used. The quotes
    -- can be " or u& or something dialect specific like []
    | Identifier (Maybe (String,String)) String
    -- | This is a prefixed variable symbol, such as :var, @var or #var
    -- (only :var is used in ansi dialect)
    | PrefixedVariable Char String
    -- | This is a positional arg identifier e.g. $1
    | PositionalArg Int
    -- | This is a string literal. The first two fields are the --
    -- start and end quotes, which are usually both ', but can be
    -- the character set (one of nNbBxX, or u&, U&), or a dialect
    -- specific string quoting (such as $$ in postgres)
    | SqlString String String String
    -- | A number literal (integral or otherwise), stored in original format
    -- unchanged
    | SqlNumber String
    -- | Whitespace, one or more of space, tab or newline.
    | Whitespace String
    -- | A commented line using --, contains every character starting with the
    -- \'--\' and including the terminating newline character if there is one
    -- - this will be missing if the last line in the source is a line comment
    -- with no trailing newline
    | LineComment String
    -- | A block comment, \/* stuff *\/, includes the comment delimiters
    | BlockComment String
      deriving (Eq,Show)
 ------------------------------------------------------------------------------
-
+-- main api functions
 -- | Pretty printing, if you lex a bunch of tokens, then pretty
 -- print them, should should get back exactly the same string
 prettyToken :: Dialect -> Token -> String
 prettyToken _ (Symbol s) = s
 prettyToken _ (Identifier Nothing t) = t
 prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
 prettyToken _ (PrefixedVariable c p) = c:p
 prettyToken _ (PositionalArg p) = '$':show p
 prettyToken _ (SqlString s e t) = s ++ t ++ e
 prettyToken _ (SqlNumber r) = r
 prettyToken _ (Whitespace t) = t
 prettyToken _ (LineComment l) = l
 prettyToken _ (BlockComment c) = c
 prettyTokens :: Dialect -> [Token] -> String
 prettyTokens d ts = concat $ map (prettyToken d) ts
 -- TODO: try to make all parsers applicative only
 -- | Lex some SQL to a list of tokens.
 lexSQL :: Dialect
@ -138,6 +151,31 @@ lexSQL dialect fn' p src =
               . flip setSourceColumn c) getPosition
          >>= setPosition
 ------------------------------------------------------------------------------
 -- pretty printing
 -- | Pretty printing, if you lex a bunch of tokens, then pretty
 -- print them, should should get back exactly the same string
 prettyToken :: Dialect -> Token -> String
 prettyToken _ (Symbol s) = s
 prettyToken _ (Identifier Nothing t) = t
 prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
 prettyToken _ (PrefixedVariable c p) = c:p
 prettyToken _ (PositionalArg p) = '$':show p
 prettyToken _ (SqlString s e t) = s ++ t ++ e
 prettyToken _ (SqlNumber r) = r
 prettyToken _ (Whitespace t) = t
 prettyToken _ (LineComment l) = l
 prettyToken _ (BlockComment c) = c
 prettyTokens :: Dialect -> [Token] -> String
 prettyTokens d ts = concat $ map (prettyToken d) ts
 ------------------------------------------------------------------------------
 -- token parsers
 -- | parser for a sql token
 sqlToken :: Dialect -> Parser ((String,Int,Int),Token)
 sqlToken d = do
@ -164,77 +202,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
                    ,symbol d
                    ,sqlWhitespace d]
-{-
+--------------------------------------
 Parses identifiers:
 simple_identifier_23
 u&"unicode quoted identifier"
 "quoted identifier"
 "quoted identifier "" with double quote char"
 `mysql quoted identifier`
 -}
 identifier :: Dialect -> Parser Token
 identifier d =
    choice
    [quotedIden
    ,unicodeQuotedIden
    ,regularIden
    ,guard (diBackquotedIden d) >> mySqlQuotedIden
    ,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
    ]
  where
    regularIden = Identifier Nothing <$> identifierString
    quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
    mySqlQuotedIden = Identifier (Just ("`","`"))
                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
    sqlServerQuotedIden = Identifier (Just ("[","]"))
                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
    -- try is used here to avoid a conflict with identifiers
    -- and quoted strings which also start with a 'u'
    unicodeQuotedIden = Identifier
                        <$> (f <$> try (oneOf "uU" <* string "&"))
                        <*> qidenPart
      where f x = Just (x: "&\"", "\"")
    qidenPart = char '"' *> qidenSuffix ""
    qidenSuffix t = do
        s <- takeTill (=='"')
        void $ char '"'
        -- deal with "" as literal double quote character
        choice [do
                void $ char '"'
                qidenSuffix $ concat [t,s,"\"\""]
               ,return $ concat [t,s]]
 -- This parses a valid identifier without quotes.
 identifierString :: Parser String
 identifierString =
    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
 -- this can be moved to the dialect at some point
 isIdentifierChar :: Char -> Bool
 isIdentifierChar c = c == '_' || isAlphaNum c
 -- use try because : and @ can be part of other things also
 prefixedVariable :: Dialect -> Parser Token
 prefixedVariable  d = try $ choice
    [PrefixedVariable <$> char ':' <*> identifierString
    ,guard (diAtIdentifier d) >>
     PrefixedVariable <$> char '@' <*> identifierString
    ,guard (diHashIdentifier d) >>
     PrefixedVariable <$> char '#' <*> identifierString
    ]
 positionalArg :: Dialect -> Parser Token
 positionalArg d =
  guard (diPositionalArg d) >>
  -- use try to avoid ambiguities with other syntax which starts with dollar
  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
 {-
 Parse a SQL string. Examples:
@ -292,6 +260,125 @@ sqlString d = dollarString <|> csString <|> normalString
                  ++ [string "u&'"
                     ,string "U&'"]
 --------------------------------------
 {-
 Parses identifiers:
 simple_identifier_23
 u&"unicode quoted identifier"
 "quoted identifier"
 "quoted identifier "" with double quote char"
 `mysql quoted identifier`
 -}
 identifier :: Dialect -> Parser Token
 identifier d =
    choice
    [quotedIden
    ,unicodeQuotedIden
    ,regularIden
    ,guard (diBackquotedIden d) >> mySqlQuotedIden
    ,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
    ]
  where
    regularIden = Identifier Nothing <$> identifierString
    quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
    mySqlQuotedIden = Identifier (Just ("`","`"))
                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
    sqlServerQuotedIden = Identifier (Just ("[","]"))
                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
    -- try is used here to avoid a conflict with identifiers
    -- and quoted strings which also start with a 'u'
    unicodeQuotedIden = Identifier
                        <$> (f <$> try (oneOf "uU" <* string "&"))
                        <*> qidenPart
      where f x = Just (x: "&\"", "\"")
    qidenPart = char '"' *> qidenSuffix ""
    qidenSuffix t = do
        s <- takeTill (=='"')
        void $ char '"'
        -- deal with "" as literal double quote character
        choice [do
                void $ char '"'
                qidenSuffix $ concat [t,s,"\"\""]
               ,return $ concat [t,s]]
 -- This parses a valid identifier without quotes.
 identifierString :: Parser String
 identifierString =
    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
 -- this can be moved to the dialect at some point
 isIdentifierChar :: Char -> Bool
 isIdentifierChar c = c == '_' || isAlphaNum c
 --------------------------------------
 lineComment :: Dialect -> Parser Token
 lineComment _ =
    (\s -> LineComment $ concat ["--",s]) <$>
    -- try is used here in case we see a - symbol
    -- once we read two -- then we commit to the comment token
    (try (string "--") *> (
        -- todo: there must be a better way to do this
     conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
  where
    conc a Nothing = a
    conc a (Just b) = a ++ b
    lineCommentEnd =
        Just "\n" <$ char '\n'
        <|> Nothing <$ eof
 --------------------------------------
 {-
 Try is used in the block comment for the two symbol bits because we
 want to backtrack if we read the first symbol but the second symbol
 isn't there.
 -}
 blockComment :: Dialect -> Parser Token
 blockComment _ =
    (\s -> BlockComment $ concat ["/*",s]) <$>
    (try (string "/*") *> commentSuffix 0)
  where
    commentSuffix :: Int -> Parser String
    commentSuffix n = do
      -- read until a possible end comment or nested comment
      x <- takeWhile (\e -> e /= '/' && e /= '*')
      choice [-- close comment: if the nesting is 0, done
              -- otherwise recurse on commentSuffix
              try (string "*/") *> let t = concat [x,"*/"]
                                   in if n == 0
                                      then return t
                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
              -- nested comment, recurse
             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
              -- not an end comment or nested comment, continue
             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
 {-
 This is to improve user experience: provide an error if we see */
 outside a comment. This could potentially break postgres ops with */
 in them (which is a stupid thing to do). In other cases, the user
 should write * / instead (I can't think of any cases when this would
 be valid syntax though).
 -}
 dontParseEndBlockComment :: Dialect -> Parser Token
 dontParseEndBlockComment _ =
    -- don't use try, then it should commit to the error
    try (string "*/") *> fail "comment end without comment start"
 --------------------------------------
 {-
 numbers
@ -341,6 +428,30 @@ sqlNumber d =
    sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
    pp = (<$$> (++))
 --------------------------------------
 positionalArg :: Dialect -> Parser Token
 positionalArg d =
  guard (diPositionalArg d) >>
  -- use try to avoid ambiguities with other syntax which starts with dollar
  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
 --------------------------------------
 -- use try because : and @ can be part of other things also
 prefixedVariable :: Dialect -> Parser Token
 prefixedVariable  d = try $ choice
    [PrefixedVariable <$> char ':' <*> identifierString
    ,guard (diAtIdentifier d) >>
     PrefixedVariable <$> char '@' <*> identifierString
    ,guard (diHashIdentifier d) >>
     PrefixedVariable <$> char '#' <*> identifierString
    ]
 --------------------------------------
 {-
 Symbols
@ -466,66 +577,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
        <*> option [] opMoreChars
       ]
 --------------------------------------
 sqlWhitespace :: Dialect -> Parser Token
 sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
-lineComment :: Dialect -> Parser Token
+----------------------------------------------------------------------------
 lineComment _ =
    (\s -> LineComment $ concat ["--",s]) <$>
    -- try is used here in case we see a - symbol
    -- once we read two -- then we commit to the comment token
    (try (string "--") *> (
        -- todo: there must be a better way to do this
     conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
  where
    conc a Nothing = a
    conc a (Just b) = a ++ b
    lineCommentEnd =
        Just "\n" <$ char '\n'
        <|> Nothing <$ eof
-{-
+-- parser helpers
 Try is used in the block comment for the two symbol bits because we
 want to backtrack if we read the first symbol but the second symbol
 isn't there.
 -}
 blockComment :: Dialect -> Parser Token
 blockComment _ =
    (\s -> BlockComment $ concat ["/*",s]) <$>
    (try (string "/*") *> commentSuffix 0)
  where
    commentSuffix :: Int -> Parser String
    commentSuffix n = do
      -- read until a possible end comment or nested comment
      x <- takeWhile (\e -> e /= '/' && e /= '*')
      choice [-- close comment: if the nesting is 0, done
              -- otherwise recurse on commentSuffix
              try (string "*/") *> let t = concat [x,"*/"]
                                   in if n == 0
                                      then return t
                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
              -- nested comment, recurse
             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
              -- not an end comment or nested comment, continue
             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
 {-
 This is to improve user experience: provide an error if we see */
 outside a comment. This could potentially break postgres ops with */
 in them (which is a stupid thing to do). In other cases, the user
 should write * / instead (I can't think of any cases when this would
 be valid syntax though).
 -}
 dontParseEndBlockComment :: Dialect -> Parser Token
 dontParseEndBlockComment _ =
    -- don't use try, then it should commit to the error
    try (string "*/") *> fail "comment end without comment start"
 -- Some helper combinators
 startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
 startsWith p ps = do
@ -545,6 +604,9 @@ takeTill p = manyTill anyChar (peekSatisfy p)
 peekSatisfy :: (Char -> Bool) -> Parser ()
 peekSatisfy p = void $ lookAhead (satisfy p)
 ----------------------------------------------------------------------------
 {-
 This utility function will accurately report if the two tokens are
 pretty printed, if they should lex back to the same two tokens. This
@ -708,44 +770,3 @@ TODO: not 100% on this always being bad
    checkLastAChar f = case prettya of
                          (_:_) -> f $ last prettya
                          _ -> False
 {-
 TODO:
 make the tokenswill print more dialect accurate. Maybe add symbol
  chars and identifier chars to the dialect definition and use them from
  here
 start adding negative / different parse dialect tests
 add token tables and tests for oracle, sql server
 review existing tables
 look for refactoring opportunities, especially the token
 generation tables in the tests
 do some user documentation on lexing, and lexing/dialects
 start thinking about a more separated design for the dialect handling
 lexing tests are starting to take a really long time, so split the
 tests so it is much easier to run all the tests except the lexing
 tests which only need to be run when working on the lexer (which
 should be relatively uncommon), or doing a commit or finishing off a
 series of commits,
 start writing the error message tests:
  generate/write a large number of syntax errors
  create a table with the source and the error message
  try to compare some different versions of code to compare the
    quality of the error messages by hand
  get this checked in so improvements and regressions in the error
    message quality can be tracked a little more easily (although it will
    still be manual)
 try again to add annotation to the ast
 -}