{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where
import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit :: forall (m :: * -> *). Monad m => ConduitT ByteString Event m ()
eventConduit = forall (m :: * -> *). Monad m => ConduitT ByteString Text m ()
decodeUtf8LenientC forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduit'
eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText :: forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduitText = forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduit'
eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' :: forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduit' =
forall (m :: * -> *). Monad m => ConduitT Text Token m ()
TS.tokenStream forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| [Name] -> ConduitT Token Event m ()
go []
where
go :: [Name] -> ConduitT Token Event m ()
go [Name]
stack = do
Maybe Token
mx <- forall (m :: * -> *) i o. Monad m => ConduitT i o m (Maybe i)
await
case Maybe Token
mx of
Maybe Token
Nothing -> forall {i}. [Name] -> ConduitT i Event m ()
closeStack [Name]
stack
Just (TS.TagOpen Text
local Map Text Text
_ Bool
_) | Text
"?" Text -> Text -> Bool
`T.isPrefixOf` Text
local -> [Name] -> ConduitT Token Event m ()
go [Name]
stack
Just (TS.TagOpen Text
local Map Text Text
attrs Bool
isClosed) -> do
let name :: Name
name = Text -> Name
toName Text
local
attrs' :: [(Name, [Content])]
attrs' = forall a b. (a -> b) -> [a] -> [b]
map (Text -> Name
toName forall (a :: * -> * -> *) b c b' c'.
Arrow a =>
a b c -> a b' c' -> a (b, b') (c, c')
*** forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Content
XT.ContentText) forall a b. (a -> b) -> a -> b
$ forall k a. Map k a -> [(k, a)]
Map.toList Map Text Text
attrs
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield forall a b. (a -> b) -> a -> b
$ Name -> [(Name, [Content])] -> Event
XT.EventBeginElement Name
name [(Name, [Content])]
attrs'
if Bool
isClosed Bool -> Bool -> Bool
|| Text -> Bool
isVoid Text
local
then forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Name -> Event
XT.EventEndElement Name
name) forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> [Name] -> ConduitT Token Event m ()
go [Name]
stack
else [Name] -> ConduitT Token Event m ()
go forall a b. (a -> b) -> a -> b
$ Name
name forall a. a -> [a] -> [a]
: [Name]
stack
Just (TS.TagClose Text
name)
| Text -> Name
toName Text
name forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Name]
stack ->
let loop :: [Name] -> ConduitT Token Event m ()
loop [] = [Name] -> ConduitT Token Event m ()
go []
loop (Name
n:[Name]
ns) = do
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield forall a b. (a -> b) -> a -> b
$ Name -> Event
XT.EventEndElement Name
n
if Name
n forall a. Eq a => a -> a -> Bool
== Text -> Name
toName Text
name
then [Name] -> ConduitT Token Event m ()
go [Name]
ns
else [Name] -> ConduitT Token Event m ()
loop [Name]
ns
in [Name] -> ConduitT Token Event m ()
loop [Name]
stack
| Bool
otherwise -> [Name] -> ConduitT Token Event m ()
go [Name]
stack
Just (TS.Text Text
t) -> do
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield forall a b. (a -> b) -> a -> b
$ Content -> Event
XT.EventContent forall a b. (a -> b) -> a -> b
$ Text -> Content
XT.ContentText Text
t
[Name] -> ConduitT Token Event m ()
go [Name]
stack
Just (TS.Comment Text
t) -> do
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield forall a b. (a -> b) -> a -> b
$ Text -> Event
XT.EventComment Text
t
[Name] -> ConduitT Token Event m ()
go [Name]
stack
Just TS.Special{} -> [Name] -> ConduitT Token Event m ()
go [Name]
stack
Just TS.Incomplete{} -> [Name] -> ConduitT Token Event m ()
go [Name]
stack
toName :: Text -> Name
toName Text
l = Text -> Maybe Text -> Maybe Text -> Name
XT.Name Text
l forall a. Maybe a
Nothing forall a. Maybe a
Nothing
closeStack :: [Name] -> ConduitT i Event m ()
closeStack = forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield forall b c a. (b -> c) -> (a -> b) -> a -> c
. Name -> Event
XT.EventEndElement)
isVoid :: Text -> Bool
isVoid Text
name = forall a. Ord a => a -> Set a -> Bool
Set.member (Text -> Text
T.toLower Text
name) Set Text
voidSet
voidSet :: Set.Set T.Text
voidSet :: Set Text
voidSet = forall a. Ord a => [a] -> Set a
Set.fromList
[ Text
"area"
, Text
"base"
, Text
"br"
, Text
"col"
, Text
"command"
, Text
"embed"
, Text
"hr"
, Text
"img"
, Text
"input"
, Text
"keygen"
, Text
"link"
, Text
"meta"
, Text
"param"
, Text
"source"
, Text
"track"
, Text
"wbr"
]
sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc :: forall (m :: * -> *) o.
MonadThrow m =>
ConduitT ByteString o m Document
sinkDoc = forall (m :: * -> *) a o.
MonadThrow m =>
ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' forall (m :: * -> *). Monad m => ConduitT ByteString Event m ()
eventConduit
sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText :: forall (m :: * -> *) o. MonadThrow m => ConduitT Text o m Document
sinkDocText = forall (m :: * -> *) a o.
MonadThrow m =>
ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduitText
sinkDoc'
:: MonadThrow m
=> ConduitT a XT.Event m ()
-> ConduitT a o m X.Document
sinkDoc' :: forall (m :: * -> *) a o.
MonadThrow m =>
ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' ConduitT a Event m ()
f =
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Document -> Document
stripDummy forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) o1 o2 i r.
Monad m =>
(o1 -> o2) -> ConduitT i o1 m r -> ConduitT i o2 m r
mapOutput ((,) forall a. Maybe a
Nothing) ConduitT a Event m ()
f forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall {a}. ConduitT (Maybe a, Event) (Maybe a, Event) m ()
addDummyWrapper forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall (m :: * -> *) o.
MonadThrow m =>
ConduitT (Maybe PositionRange, Event) o m Document
X.fromEvents
where
addDummyWrapper :: ConduitT (Maybe a, Event) (Maybe a, Event) m ()
addDummyWrapper = do
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (forall a. Maybe a
Nothing, Name -> [(Name, [Content])] -> Event
XT.EventBeginElement Name
"html" [])
forall (m :: * -> *) i o r.
Monad m =>
(i -> ConduitT i o m r) -> ConduitT i o m ()
awaitForever forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (forall a. Maybe a
Nothing, Name -> Event
XT.EventEndElement Name
"html")
stripDummy :: Document -> Document
stripDummy doc :: Document
doc@(X.Document Prologue
pro (X.Element Name
_ Map Name Text
_ [Node]
nodes) [Miscellaneous]
epi) =
case forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Node -> Maybe Element
toElement [Node]
nodes of
[Element
root] -> Prologue -> Element -> [Miscellaneous] -> Document
X.Document Prologue
pro Element
root [Miscellaneous]
epi
[Element]
_ -> Document
doc
toElement :: Node -> Maybe Element
toElement (X.NodeElement Element
e) = forall a. a -> Maybe a
Just Element
e
toElement Node
_ = forall a. Maybe a
Nothing
readFile :: FilePath -> IO X.Document
readFile :: [Char] -> IO Document
readFile [Char]
fp = forall (m :: * -> *) (n :: * -> *) i a.
(MonadUnliftIO m, MonadIO n) =>
[Char] -> (ConduitM i ByteString n () -> m a) -> m a
withSourceFile [Char]
fp forall a b. (a -> b) -> a -> b
$ \ConduitM () ByteString IO ()
src -> forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit forall a b. (a -> b) -> a -> b
$ ConduitM () ByteString IO ()
src forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall (m :: * -> *) o.
MonadThrow m =>
ConduitT ByteString o m Document
sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS :: ByteString -> Document
parseLBS = [ByteString] -> Document
parseBSChunks forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
L.toChunks
parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks :: [ByteString] -> Document
parseBSChunks [ByteString]
tss =
case forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a i. Monad m => [a] -> ConduitT i a m ()
CL.sourceList [ByteString]
tss forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall (m :: * -> *) o.
MonadThrow m =>
ConduitT ByteString o m Document
sinkDoc of
Left SomeException
e -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Unexpected exception in parseBSChunks: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show SomeException
e
Right Document
x -> Document
x
parseLT :: TL.Text -> X.Document
parseLT :: Text -> Document
parseLT = [Text] -> Document
parseSTChunks forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
TL.toChunks
parseSTChunks :: [T.Text] -> X.Document
parseSTChunks :: [Text] -> Document
parseSTChunks [Text]
tss =
case forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a i. Monad m => [a] -> ConduitT i a m ()
CL.sourceList [Text]
tss forall (m :: * -> *) a b c r.
Monad m =>
ConduitT a b m () -> ConduitT b c m r -> ConduitT a c m r
.| forall (m :: * -> *) o. MonadThrow m => ConduitT Text o m Document
sinkDocText of
Left SomeException
e -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Unexpected exception in parseSTChunks: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show SomeException
e
Right Document
x -> Document
x