
move the megaparsec stream stuff to the lexer, restore the old lexSQL return type without source positions, and add a new lex function that does return source positions

Jake Wheat 2024-01-10 11:57:13 +00:00
parent e76aa2818b
commit de121d1fd6
4 changed files with 132 additions and 117 deletions
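For reference, a minimal usage sketch of the split lexer API described by this commit (the function and type names come from the Lex diff below; the wrapper program, the "<example>" filename, the sample query, and the Show-based printing are illustrative assumptions only):

    {-# LANGUAGE OverloadedStrings #-}
    module Main (main) where

    -- illustrative only: exercises the two entry points shown in the diff below
    import qualified Language.SQL.SimpleSQL.Lex as Lex
    import qualified Data.Text as T

    main :: IO ()
    main = do
        let src = "select a + b from t" :: T.Text
        -- restored entry point: plain token list, no source positions
        case Lex.lexSQL Lex.ansi2011 "<example>" Nothing src of
            Left e   -> putStrLn (T.unpack (Lex.prettyError e))
            Right ts -> print ts
        -- new entry point: tokens wrapped with their source positions
        case Lex.lexSQLWithPositions Lex.ansi2011 "<example>" Nothing src of
            Left e    -> putStrLn (T.unpack (Lex.prettyError e))
            Right wps -> print (map Lex.tokenVal wps)

The parser side of the change is visible further down: wrapParse now calls L.lexSQLWithPositions and feeds the positioned tokens into L.MyStream, the stream wrapper this commit moves from Parse into Lex.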


@@ -73,16 +73,21 @@ try again to add annotation to the ast
-- | Lexer for SQL.
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TypeFamilies #-}
module Language.SQL.SimpleSQL.Lex
(Token(..)
,WithPos(..)
,lexSQL
,lexSQLWithPositions
,prettyToken
,prettyTokens
,ParseError
,prettyError
,tokenListWillPrintAndLex
,ansi2011
,MyStream(..)
) where
import Language.SQL.SimpleSQL.Dialect
@@ -94,6 +99,10 @@ import Text.Megaparsec
(Parsec
,runParser'
,PosState(..)
,TraversableStream(..)
,VisualStream(..)
,ParseErrorBundle(..)
,errorBundlePretty
@@ -125,10 +134,13 @@ import Text.Megaparsec.Char
,char
)
import Text.Megaparsec.State (initialState)
import Control.Applicative ((<**>))
import qualified Data.List as DL
import qualified Data.List.NonEmpty as NE
import Data.Proxy (Proxy(..))
import Data.Void (Void)
import Control.Applicative ((<**>))
import Data.Char
(isAlphaNum
,isAlpha
@@ -184,17 +196,34 @@ data Token
-- main api functions
-- | Lex some SQL to a list of tokens.
lexSQL :: Dialect
-- ^ dialect of SQL to use
-> Text
-- ^ filename to use in error messages
-> Maybe (Int,Int)
-- ^ line number and column number of the first character
-- in the source to use in error messages
-> Text
-- ^ the SQL source to lex
-> Either ParseError [WithPos Token]
lexSQL dialect fn p src = myParse fn p (many (sqlToken dialect) <* (eof <?> "")) src
lexSQLWithPositions
:: Dialect
-- ^ dialect of SQL to use
-> Text
-- ^ filename to use in error messages
-> Maybe (Int,Int)
-- ^ line number and column number of the first character
-- in the source to use in error messages
-> Text
-- ^ the SQL source to lex
-> Either ParseError [WithPos Token]
lexSQLWithPositions dialect fn p src = myParse fn p (many (sqlToken dialect) <* (eof <?> "")) src
-- | Lex some SQL to a list of tokens.
lexSQL
:: Dialect
-- ^ dialect of SQL to use
-> Text
-- ^ filename to use in error messages
-> Maybe (Int,Int)
-- ^ line number and column number of the first character
-- in the source to use in error messages
-> Text
-- ^ the SQL source to lex
-> Either ParseError [Token]
lexSQL dialect fn p src =
fmap (map tokenVal) $ lexSQLWithPositions dialect fn p src
myParse :: Text -> (Maybe (Int,Int)) -> Parser a -> Text -> Either ParseError a
myParse name sp' p s =
@@ -823,3 +852,88 @@ TODO: not 100% on this always being bad
checkLastAChar f = case T.unsnoc prettya of
Just (_,la) -> f la
_ -> False
------------------------------------------------------------------------------
-- megaparsec stream boilerplate
data MyStream = MyStream
{ myStreamInput :: String
, unMyStream :: [WithPos Token]
}
instance M.Stream MyStream where
type Token MyStream = WithPos Token
type Tokens MyStream = [WithPos Token]
tokenToChunk Proxy x = [x]
tokensToChunk Proxy xs = xs
chunkToTokens Proxy = id
chunkLength Proxy = length
chunkEmpty Proxy = null
take1_ (MyStream _ []) = Nothing
take1_ (MyStream str (t:ts)) = Just
( t
, MyStream (drop (tokensLength pxy (t NE.:|[])) str) ts
)
takeN_ n (MyStream str s)
| n <= 0 = Just ([], MyStream str s)
| null s = Nothing
| otherwise =
let (x, s') = splitAt n s
in case NE.nonEmpty x of
Nothing -> Just (x, MyStream str s')
Just nex -> Just (x, MyStream (drop (tokensLength pxy nex) str) s')
takeWhile_ f (MyStream str s) =
let (x, s') = DL.span f s
in case NE.nonEmpty x of
Nothing -> (x, MyStream str s')
Just nex -> (x, MyStream (drop (tokensLength pxy nex) str) s')
instance VisualStream MyStream where
showTokens Proxy = DL.intercalate " "
. NE.toList
. fmap (showMyToken . tokenVal)
tokensLength Proxy xs = sum (tokenLength <$> xs)
instance TraversableStream MyStream where
reachOffset o M.PosState {..} =
( Just (prefix ++ restOfLine)
, PosState
{ pstateInput = MyStream
{ myStreamInput = postStr
, unMyStream = post
}
, pstateOffset = max pstateOffset o
, pstateSourcePos = newSourcePos
, pstateTabWidth = pstateTabWidth
, pstateLinePrefix = prefix
}
)
where
prefix =
if sameLine
then pstateLinePrefix ++ preLine
else preLine
sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
newSourcePos =
case post of
[] -> case unMyStream pstateInput of
[] -> pstateSourcePos
xs -> endPos (last xs)
(x:_) -> startPos x
(pre, post) = splitAt (o - pstateOffset) (unMyStream pstateInput)
(preStr, postStr) = splitAt tokensConsumed (myStreamInput pstateInput)
preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
tokensConsumed =
case NE.nonEmpty pre of
Nothing -> 0
Just nePre -> tokensLength pxy nePre
restOfLine = takeWhile (/= '\n') postStr
pxy :: Proxy MyStream
pxy = Proxy
showMyToken :: Token -> String
-- todo: how to do this properly?
showMyToken = T.unpack . prettyToken ansi2011


@@ -177,10 +177,7 @@ fixing them in the syntax but leaving them till the semantic checking
-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TypeFamilies #-}
-- | This is the module with the parser functions.
module Language.SQL.SimpleSQL.Parse
(parseQueryExpr
@@ -195,14 +192,6 @@ import Text.Megaparsec
(ParsecT
,runParserT
,Stream(..)
,PosState(..)
,TraversableStream(..)
,VisualStream(..)
--,ErrorItem(Tokens)
,sourceLine
,ParseErrorBundle(..)
,errorBundlePretty
@@ -229,10 +218,7 @@ import Control.Monad.Reader
,ask
)
import qualified Data.List as DL
import qualified Data.List.NonEmpty as NE
import qualified Data.Set as Set
import Data.Proxy (Proxy(..))
import Data.Void (Void)
import Control.Monad (guard, void)
@@ -315,7 +301,7 @@ parseScalarExpr = wrapParse scalarExpr
data ParseError
= LexError L.ParseError
| ParseError (ParseErrorBundle MyStream Void)
| ParseError (ParseErrorBundle L.MyStream Void)
prettyError :: ParseError -> Text
prettyError (LexError e) = T.pack $ errorBundlePretty e
@@ -337,10 +323,10 @@ wrapParse :: Parser a
-> Text
-> Either ParseError a
wrapParse parser d f p src = do
lx <- either (Left . LexError) Right $ L.lexSQL d f p src
lx <- either (Left . LexError) Right $ L.lexSQLWithPositions d f p src
either (Left . ParseError) Right $
runReader (runParserT (parser <* (eof <?> "")) (T.unpack f)
$ MyStream (T.unpack src) $ filter notSpace lx) d
$ L.MyStream (T.unpack src) $ filter notSpace lx) d
where
notSpace = notSpace' . L.tokenVal
notSpace' (L.Whitespace {}) = False
@@ -352,7 +338,7 @@ wrapParse parser d f p src = do
-- parsing code
type Parser = ParsecT Void MyStream (Reader Dialect)
type Parser = ParsecT Void L.MyStream (Reader Dialect)
{-
------------------------------------------------
@@ -2384,87 +2370,3 @@ queryDialect f = do
d <- ask
pure $ f d
------------------------------------------------------------------------------
-- parsec stream boilerplate
data MyStream = MyStream
{ myStreamInput :: String
, unMyStream :: [L.WithPos L.Token]
}
instance Stream MyStream where
type Token MyStream = L.WithPos L.Token
type Tokens MyStream = [L.WithPos L.Token]
tokenToChunk Proxy x = [x]
tokensToChunk Proxy xs = xs
chunkToTokens Proxy = id
chunkLength Proxy = length
chunkEmpty Proxy = null
take1_ (MyStream _ []) = Nothing
take1_ (MyStream str (t:ts)) = Just
( t
, MyStream (drop (tokensLength pxy (t NE.:|[])) str) ts
)
takeN_ n (MyStream str s)
| n <= 0 = Just ([], MyStream str s)
| null s = Nothing
| otherwise =
let (x, s') = splitAt n s
in case NE.nonEmpty x of
Nothing -> Just (x, MyStream str s')
Just nex -> Just (x, MyStream (drop (tokensLength pxy nex) str) s')
takeWhile_ f (MyStream str s) =
let (x, s') = DL.span f s
in case NE.nonEmpty x of
Nothing -> (x, MyStream str s')
Just nex -> (x, MyStream (drop (tokensLength pxy nex) str) s')
instance VisualStream MyStream where
showTokens Proxy = DL.intercalate " "
. NE.toList
. fmap (showMyToken . L.tokenVal)
tokensLength Proxy xs = sum (L.tokenLength <$> xs)
instance TraversableStream MyStream where
reachOffset o PosState {..} =
( Just (prefix ++ restOfLine)
, PosState
{ pstateInput = MyStream
{ myStreamInput = postStr
, unMyStream = post
}
, pstateOffset = max pstateOffset o
, pstateSourcePos = newSourcePos
, pstateTabWidth = pstateTabWidth
, pstateLinePrefix = prefix
}
)
where
prefix =
if sameLine
then pstateLinePrefix ++ preLine
else preLine
sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
newSourcePos =
case post of
[] -> case unMyStream pstateInput of
[] -> pstateSourcePos
xs -> L.endPos (last xs)
(x:_) -> L.startPos x
(pre, post) = splitAt (o - pstateOffset) (unMyStream pstateInput)
(preStr, postStr) = splitAt tokensConsumed (myStreamInput pstateInput)
preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
tokensConsumed =
case NE.nonEmpty pre of
Nothing -> 0
Just nePre -> tokensLength pxy nePre
restOfLine = takeWhile (/= '\n') postStr
pxy :: Proxy MyStream
pxy = Proxy
showMyToken :: L.Token -> String
-- todo: how to do this properly?
showMyToken = T.unpack . L.prettyToken ansi2011


@@ -82,7 +82,7 @@ bootstrapTests = Group "bootstrap tests" $
] ++ map (\a -> (a, [Symbol a])) (
["!=", "<>", ">=", "<=", "||"]
++ map T.singleton ("(),-+*/<>=." :: String)))
++ map T.singleton ("(),-+*/<>=." :: [Char])))
ansiLexerTable :: [(Text,[Token])]


@@ -110,8 +110,7 @@ itemToTest (LexFails d s) = makeLexingFailsTest d s
makeLexerTest :: Dialect -> Text -> [Lex.Token] -> T.TestTree
makeLexerTest d s ts = H.testCase (T.unpack s) $ do
let lx = either (error . T.unpack . Lex.prettyError) id $ Lex.lexSQL d "" Nothing s
ts1 = map Lex.tokenVal lx
let ts1 = either (error . T.unpack . Lex.prettyError) id $ Lex.lexSQL d "" Nothing s
H.assertEqual "" ts ts1
let s' = Lex.prettyTokens d $ ts1
H.assertEqual "pretty print" s s'