I want to use xml-conduit, specifically Text.XML.Stream.Parse in order to lazily extract a list of objects from a large XML file.
As a test case, I use the recently
Based on Michael Snoyman's excellent answer here is a modified version that reads the data from stackoverflow.com-Users.7z
instead of sourcing it from an artificially generated IO stream.
For a reference on how to use xml-conduit
directly, please see Michael's answer. This answer is only provided as an example on how to use the method described there on optionally compressed files.
The main change here is that you need to use runResourceT
to read the file, and the final print
needs to be lift
ed from IO ()
to ResourceT IO ()
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
import qualified Data.Conduit.Binary as CB
import Control.Applicative ((<*))
import Control.Concurrent (threadDelay)
import Control.Monad (forever, void)
import Control.Monad.IO.Class (MonadIO (liftIO))
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as LB
import Data.Conduit
import qualified Data.Conduit.List as CL
import Data.Text (Text)
import Data.Text.Encoding (encodeUtf8)
import Data.XML.Types (Event)
import Text.XML.Stream.Parse
import Data.Conduit.BZlib (bunzip2)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.Resource (MonadThrow, runResourceT)
data User = User {name :: Text} deriving (Show)
parseUserRow :: MonadThrow m => Consumer Event m (Maybe User)
parseUserRow = tagName "row" (requireAttr "DisplayName" <* ignoreAttrs) $ \displayName -> do
return $ User displayName
parseUsers :: MonadThrow m => Conduit Event m User
parseUsers = void $ tagNoAttr "users" $ yieldWhileJust parseUserRow
yieldWhileJust :: Monad m
=> ConduitM a b m (Maybe b)
-> Conduit a m b
yieldWhileJust consumer =
loop
where
loop = do
mx <- consumer
case mx of
Nothing -> return ()
Just x -> yield x >> loop
main :: IO ()
main = runResourceT $ CB.sourceFile "stackoverflow.com-Users.7z" $= bunzip2 $$ parseBytes def
=$ parseUsers
=$ CL.mapM_ (lift . print)