reorder the lex file in prep

2024-01-09 17:47:04 +00:00 · 2024-01-09 17:47:04 +00:00 · d80796b1dd
commit d80796b1dd
parent 8c05ffd0b6
1 changed files with 217 additions and 196 deletions
--- a/Language/SQL/SimpleSQL/Lex.hs
+++ b/Language/SQL/SimpleSQL/Lex.hs
@ -16,6 +16,42 @@ don't do nearly as comprehensive testing on the syntax level, we still
 have a relatively high assurance of the low level of bugs. This is
 much more difficult to get parity with when testing the syntax parser
 directly without the separately testing lexing stage.
+
+TODO:
+
+make the tokenswill print more dialect accurate. Maybe add symbol
+  chars and identifier chars to the dialect definition and use them from
+  here
+
+start adding negative / different parse dialect tests
+
+add token tables and tests for oracle, sql server
+review existing tables
+
+look for refactoring opportunities, especially the token
+generation tables in the tests
+
+do some user documentation on lexing, and lexing/dialects
+
+start thinking about a more separated design for the dialect handling
+
+lexing tests are starting to take a really long time, so split the
+tests so it is much easier to run all the tests except the lexing
+tests which only need to be run when working on the lexer (which
+should be relatively uncommon), or doing a commit or finishing off a
+series of commits,
+
+start writing the error message tests:
+  generate/write a large number of syntax errors
+  create a table with the source and the error message
+  try to compare some different versions of code to compare the
+    quality of the error messages by hand
+
+  get this checked in so improvements and regressions in the error
+    message quality can be tracked a little more easily (although it will
+    still be manual)
+
+try again to add annotation to the ast
 -}

 -- | Lexer for SQL.
@ -49,6 +85,9 @@ import Prelude hiding (takeWhile)
 import Text.Parsec.String (Parser)
 import Data.Maybe

+------------------------------------------------------------------------------
+
+-- syntax

 -- | Represents a lexed token
 data Token
@ -58,63 +97,37 @@ data Token
    -- * single char symbols: * + -  < >  ^ / %  ~ & | ? ( ) [ ] , ; ( )
    --
    = Symbol String
-
    -- | This is an identifier or keyword. The first field is
    -- the quotes used, or nothing if no quotes were used. The quotes
    -- can be " or u& or something dialect specific like []
    | Identifier (Maybe (String,String)) String
-
    -- | This is a prefixed variable symbol, such as :var, @var or #var
    -- (only :var is used in ansi dialect)
    | PrefixedVariable Char String
-
    -- | This is a positional arg identifier e.g. $1
    | PositionalArg Int
-
    -- | This is a string literal. The first two fields are the --
    -- start and end quotes, which are usually both ', but can be
    -- the character set (one of nNbBxX, or u&, U&), or a dialect
    -- specific string quoting (such as $$ in postgres)
    | SqlString String String String
-
    -- | A number literal (integral or otherwise), stored in original format
    -- unchanged
    | SqlNumber String
-
    -- | Whitespace, one or more of space, tab or newline.
    | Whitespace String
-
    -- | A commented line using --, contains every character starting with the
    -- \'--\' and including the terminating newline character if there is one
    -- - this will be missing if the last line in the source is a line comment
    -- with no trailing newline
    | LineComment String
-
    -- | A block comment, \/* stuff *\/, includes the comment delimiters
    | BlockComment String
-
      deriving (Eq,Show)

+------------------------------------------------------------------------------

-
-- | Pretty printing, if you lex a bunch of tokens, then pretty
-- print them, should should get back exactly the same string
-prettyToken :: Dialect -> Token -> String
-prettyToken _ (Symbol s) = s
-prettyToken _ (Identifier Nothing t) = t
-prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
-prettyToken _ (PrefixedVariable c p) = c:p
-prettyToken _ (PositionalArg p) = '$':show p
-prettyToken _ (SqlString s e t) = s ++ t ++ e
-prettyToken _ (SqlNumber r) = r
-prettyToken _ (Whitespace t) = t
-prettyToken _ (LineComment l) = l
-prettyToken _ (BlockComment c) = c
-
-prettyTokens :: Dialect -> [Token] -> String
-prettyTokens d ts = concat $ map (prettyToken d) ts
-
-- TODO: try to make all parsers applicative only
+-- main api functions

 -- | Lex some SQL to a list of tokens.
 lexSQL :: Dialect
@ -138,6 +151,31 @@ lexSQL dialect fn' p src =
               . flip setSourceColumn c) getPosition
          >>= setPosition

+------------------------------------------------------------------------------
+
+-- pretty printing
+
+-- | Pretty printing, if you lex a bunch of tokens, then pretty
+-- print them, should should get back exactly the same string
+prettyToken :: Dialect -> Token -> String
+prettyToken _ (Symbol s) = s
+prettyToken _ (Identifier Nothing t) = t
+prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
+prettyToken _ (PrefixedVariable c p) = c:p
+prettyToken _ (PositionalArg p) = '$':show p
+prettyToken _ (SqlString s e t) = s ++ t ++ e
+prettyToken _ (SqlNumber r) = r
+prettyToken _ (Whitespace t) = t
+prettyToken _ (LineComment l) = l
+prettyToken _ (BlockComment c) = c
+
+prettyTokens :: Dialect -> [Token] -> String
+prettyTokens d ts = concat $ map (prettyToken d) ts
+
+------------------------------------------------------------------------------
+
+-- token parsers
+
 -- | parser for a sql token
 sqlToken :: Dialect -> Parser ((String,Int,Int),Token)
 sqlToken d = do
@ -164,77 +202,7 @@ this is also tried before symbol (a .1 will be parsed as a number, but
                    ,symbol d
                    ,sqlWhitespace d]

-{-
-Parses identifiers:
-
-simple_identifier_23
-u&"unicode quoted identifier"
-"quoted identifier"
-"quoted identifier "" with double quote char"
-`mysql quoted identifier`
-}
-
-identifier :: Dialect -> Parser Token
-identifier d =
-    choice
-    [quotedIden
-    ,unicodeQuotedIden
-    ,regularIden
-    ,guard (diBackquotedIden d) >> mySqlQuotedIden
-    ,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
-    ]
-  where
-    regularIden = Identifier Nothing <$> identifierString
-    quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
-    mySqlQuotedIden = Identifier (Just ("`","`"))
-                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
-    sqlServerQuotedIden = Identifier (Just ("[","]"))
-                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
-    -- try is used here to avoid a conflict with identifiers
-    -- and quoted strings which also start with a 'u'
-    unicodeQuotedIden = Identifier
-                        <$> (f <$> try (oneOf "uU" <* string "&"))
-                        <*> qidenPart
-      where f x = Just (x: "&\"", "\"")
-    qidenPart = char '"' *> qidenSuffix ""
-    qidenSuffix t = do
-        s <- takeTill (=='"')
-        void $ char '"'
-        -- deal with "" as literal double quote character
-        choice [do
-                void $ char '"'
-                qidenSuffix $ concat [t,s,"\"\""]
-               ,return $ concat [t,s]]
-
-
-- This parses a valid identifier without quotes.
-
-identifierString :: Parser String
-identifierString =
-    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
-
-- this can be moved to the dialect at some point
-
-isIdentifierChar :: Char -> Bool
-isIdentifierChar c = c == '_' || isAlphaNum c
-
-- use try because : and @ can be part of other things also
-
-prefixedVariable :: Dialect -> Parser Token
-prefixedVariable  d = try $ choice
-    [PrefixedVariable <$> char ':' <*> identifierString
-    ,guard (diAtIdentifier d) >>
-     PrefixedVariable <$> char '@' <*> identifierString
-    ,guard (diHashIdentifier d) >>
-     PrefixedVariable <$> char '#' <*> identifierString
-    ]
-
-positionalArg :: Dialect -> Parser Token
-positionalArg d =
-  guard (diPositionalArg d) >>
-  -- use try to avoid ambiguities with other syntax which starts with dollar
-  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
-
+--------------------------------------

 {-
 Parse a SQL string. Examples:
@ -292,6 +260,125 @@ sqlString d = dollarString <|> csString <|> normalString
                  ++ [string "u&'"
                     ,string "U&'"]

+
+--------------------------------------
+
+{-
+Parses identifiers:
+
+simple_identifier_23
+u&"unicode quoted identifier"
+"quoted identifier"
+"quoted identifier "" with double quote char"
+`mysql quoted identifier`
+-}
+
+identifier :: Dialect -> Parser Token
+identifier d =
+    choice
+    [quotedIden
+    ,unicodeQuotedIden
+    ,regularIden
+    ,guard (diBackquotedIden d) >> mySqlQuotedIden
+    ,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
+    ]
+  where
+    regularIden = Identifier Nothing <$> identifierString
+    quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
+    mySqlQuotedIden = Identifier (Just ("`","`"))
+                      <$> (char '`' *> takeWhile1 (/='`') <* char '`')
+    sqlServerQuotedIden = Identifier (Just ("[","]"))
+                          <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
+    -- try is used here to avoid a conflict with identifiers
+    -- and quoted strings which also start with a 'u'
+    unicodeQuotedIden = Identifier
+                        <$> (f <$> try (oneOf "uU" <* string "&"))
+                        <*> qidenPart
+      where f x = Just (x: "&\"", "\"")
+    qidenPart = char '"' *> qidenSuffix ""
+    qidenSuffix t = do
+        s <- takeTill (=='"')
+        void $ char '"'
+        -- deal with "" as literal double quote character
+        choice [do
+                void $ char '"'
+                qidenSuffix $ concat [t,s,"\"\""]
+               ,return $ concat [t,s]]
+
+
+-- This parses a valid identifier without quotes.
+
+identifierString :: Parser String
+identifierString =
+    startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
+
+-- this can be moved to the dialect at some point
+
+isIdentifierChar :: Char -> Bool
+isIdentifierChar c = c == '_' || isAlphaNum c
+
+--------------------------------------
+
+lineComment :: Dialect -> Parser Token
+lineComment _ =
+    (\s -> LineComment $ concat ["--",s]) <$>
+    -- try is used here in case we see a - symbol
+    -- once we read two -- then we commit to the comment token
+    (try (string "--") *> (
+        -- todo: there must be a better way to do this
+     conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
+  where
+    conc a Nothing = a
+    conc a (Just b) = a ++ b
+    lineCommentEnd =
+        Just "\n" <$ char '\n'
+        <|> Nothing <$ eof
+
+--------------------------------------
+
+{-
+Try is used in the block comment for the two symbol bits because we
+want to backtrack if we read the first symbol but the second symbol
+isn't there.
+-}
+
+blockComment :: Dialect -> Parser Token
+blockComment _ =
+    (\s -> BlockComment $ concat ["/*",s]) <$>
+    (try (string "/*") *> commentSuffix 0)
+  where
+    commentSuffix :: Int -> Parser String
+    commentSuffix n = do
+      -- read until a possible end comment or nested comment
+      x <- takeWhile (\e -> e /= '/' && e /= '*')
+      choice [-- close comment: if the nesting is 0, done
+              -- otherwise recurse on commentSuffix
+              try (string "*/") *> let t = concat [x,"*/"]
+                                   in if n == 0
+                                      then return t
+                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
+              -- nested comment, recurse
+             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
+              -- not an end comment or nested comment, continue
+             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
+
+
+{-
+This is to improve user experience: provide an error if we see */
+outside a comment. This could potentially break postgres ops with */
+in them (which is a stupid thing to do). In other cases, the user
+should write * / instead (I can't think of any cases when this would
+be valid syntax though).
+-}
+
+dontParseEndBlockComment :: Dialect -> Parser Token
+dontParseEndBlockComment _ =
+    -- don't use try, then it should commit to the error
+    try (string "*/") *> fail "comment end without comment start"
+
+--------------------------------------
+
+
 {-
 numbers

@ -341,6 +428,30 @@ sqlNumber d =
    sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
    pp = (<$$> (++))

+--------------------------------------
+
+positionalArg :: Dialect -> Parser Token
+positionalArg d =
+  guard (diPositionalArg d) >>
+  -- use try to avoid ambiguities with other syntax which starts with dollar
+  PositionalArg <$> try (char '$' *> (read <$> many1 digit))
+
+
+--------------------------------------
+
+-- use try because : and @ can be part of other things also
+
+prefixedVariable :: Dialect -> Parser Token
+prefixedVariable  d = try $ choice
+    [PrefixedVariable <$> char ':' <*> identifierString
+    ,guard (diAtIdentifier d) >>
+     PrefixedVariable <$> char '@' <*> identifierString
+    ,guard (diHashIdentifier d) >>
+     PrefixedVariable <$> char '#' <*> identifierString
+    ]
+
+--------------------------------------
+
 {-
 Symbols

@ -466,66 +577,14 @@ generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
        <*> option [] opMoreChars
       ]

+--------------------------------------
+
 sqlWhitespace :: Dialect -> Parser Token
 sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)

-lineComment :: Dialect -> Parser Token
-lineComment _ =
-    (\s -> LineComment $ concat ["--",s]) <$>
-    -- try is used here in case we see a - symbol
-    -- once we read two -- then we commit to the comment token
-    (try (string "--") *> (
-        -- todo: there must be a better way to do this
-     conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
-  where
-    conc a Nothing = a
-    conc a (Just b) = a ++ b
-    lineCommentEnd =
-        Just "\n" <$ char '\n'
-        <|> Nothing <$ eof
+----------------------------------------------------------------------------

-{-
-Try is used in the block comment for the two symbol bits because we
-want to backtrack if we read the first symbol but the second symbol
-isn't there.
-}
-
-blockComment :: Dialect -> Parser Token
-blockComment _ =
-    (\s -> BlockComment $ concat ["/*",s]) <$>
-    (try (string "/*") *> commentSuffix 0)
-  where
-    commentSuffix :: Int -> Parser String
-    commentSuffix n = do
-      -- read until a possible end comment or nested comment
-      x <- takeWhile (\e -> e /= '/' && e /= '*')
-      choice [-- close comment: if the nesting is 0, done
-              -- otherwise recurse on commentSuffix
-              try (string "*/") *> let t = concat [x,"*/"]
-                                   in if n == 0
-                                      then return t
-                                      else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
-              -- nested comment, recurse
-             ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
-              -- not an end comment or nested comment, continue
-             ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
-
-
-{-
-This is to improve user experience: provide an error if we see */
-outside a comment. This could potentially break postgres ops with */
-in them (which is a stupid thing to do). In other cases, the user
-should write * / instead (I can't think of any cases when this would
-be valid syntax though).
-}
-
-dontParseEndBlockComment :: Dialect -> Parser Token
-dontParseEndBlockComment _ =
-    -- don't use try, then it should commit to the error
-    try (string "*/") *> fail "comment end without comment start"
-
-
-- Some helper combinators
+-- parser helpers

 startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
 startsWith p ps = do
@ -545,6 +604,9 @@ takeTill p = manyTill anyChar (peekSatisfy p)
 peekSatisfy :: (Char -> Bool) -> Parser ()
 peekSatisfy p = void $ lookAhead (satisfy p)

+----------------------------------------------------------------------------
+
+
 {-
 This utility function will accurately report if the two tokens are
 pretty printed, if they should lex back to the same two tokens. This
@ -708,44 +770,3 @@ TODO: not 100% on this always being bad
    checkLastAChar f = case prettya of
                          (_:_) -> f $ last prettya
                          _ -> False
-
-
-
-
-{-
-TODO:
-
-make the tokenswill print more dialect accurate. Maybe add symbol
-  chars and identifier chars to the dialect definition and use them from
-  here
-
-start adding negative / different parse dialect tests
-
-add token tables and tests for oracle, sql server
-review existing tables
-
-look for refactoring opportunities, especially the token
-generation tables in the tests
-
-do some user documentation on lexing, and lexing/dialects
-
-start thinking about a more separated design for the dialect handling
-
-lexing tests are starting to take a really long time, so split the
-tests so it is much easier to run all the tests except the lexing
-tests which only need to be run when working on the lexer (which
-should be relatively uncommon), or doing a commit or finishing off a
-series of commits,
-
-start writing the error message tests:
-  generate/write a large number of syntax errors
-  create a table with the source and the error message
-  try to compare some different versions of code to compare the
-    quality of the error messages by hand
-
-  get this checked in so improvements and regressions in the error
-    message quality can be tracked a little more easily (although it will
-    still be manual)
-
-try again to add annotation to the ast
-}