diff --git a/conduit-extra/Data/Conduit/Text.hs b/conduit-extra/Data/Conduit/Text.hs
index 008d190a1..30d05dc5a 100644
--- a/conduit-extra/Data/Conduit/Text.hs
+++ b/conduit-extra/Data/Conduit/Text.hs
@@ -25,6 +25,7 @@ module Data.Conduit.Text
     , iso8859_1
     , lines
     , linesBounded
+    , split
     , TextException (..)
     , takeWhile
     , dropWhile
@@ -131,6 +132,35 @@ linesBounded maxLineLen =
                         awaitText len' $ buf `T.append` text
 
+-- | Split a stream of text on every occurrence of the given separator and
+-- yield the pieces found between separators.
+--
+-- Chunk boundaries of the incoming stream are not significant: text is
+-- buffered until a separator is seen, so a separator that straddles two
+-- chunks is still recognised.
+--
+-- A trailing empty piece is never emitted, so input that ends with the
+-- separator does not produce a final empty 'T.Text'.
+--
+-- The separator must be non-empty: 'T.splitOn' raises an error when it is
+-- given an empty separator.
+split :: Monad m => T.Text -> Conduit T.Text m T.Text
+split splitText = awaitText T.empty
+  where
+    -- @pending@ is the text received since the last separator.
+    awaitText pending = await >>= maybe (finish pending) (process pending)
+
+    -- End of input: flush what is left unless it is empty.
+    finish pending = unless (T.null pending) (yield pending)
+
+    process pending text = emit $ T.splitOn splitText (pending `T.append` text)
+
+    -- Every piece except the last is complete. The last one may still be
+    -- extended by later chunks, so it becomes the new pending text.
+    emit []             = awaitText T.empty
+    emit [unfinished]   = awaitText unfinished
+    emit (piece : rest) = yield piece >> emit rest
+
 -- | Convert text into bytes, using the provided codec. If the codec is
 -- not capable of representing an input character, an exception will be thrown.
diff --git a/conduit-extra/conduit-extra.cabal b/conduit-extra/conduit-extra.cabal
index 7b8523fd0..da1527625 100644
--- a/conduit-extra/conduit-extra.cabal
+++ b/conduit-extra/conduit-extra.cabal
@@ -66,6 +66,7 @@ test-suite test
                    , conduit-extra
                    , base
                    , hspec >= 1.3
+                   , QuickCheck
 
                    , async
                    , attoparsec
diff --git a/conduit-extra/test/Data/Conduit/TextSpec.hs b/conduit-extra/test/Data/Conduit/TextSpec.hs
index 7aba8d310..1e1d50a60 100644
--- a/conduit-extra/test/Data/Conduit/TextSpec.hs
+++ b/conduit-extra/test/Data/Conduit/TextSpec.hs
@@ -7,6 +7,7 @@ import qualified Data.Conduit.Lift as C
 import qualified Data.Conduit.List as CL
 import Test.Hspec
 import Test.Hspec.QuickCheck
+import Test.QuickCheck.Modifiers
 import Data.Monoid
 import Control.Monad.ST
 import qualified Data.Text as T
@@ -187,6 +188,39 @@ spec = describe "Data.Conduit.Text" $ do
             x <- CL.sourceList ["foobarbaz", error "ignore me"] C.$$ CT.decode CT.utf8 C.=$ CL.head
             x `shouldBe` Just "foobarbaz"
 
+    describe "split function" $ do
+        -- Feed the chunks through @split "XX"@ and collect every piece.
+        let splitOnXX chunks =
+                CL.sourceList chunks C.$= CT.split "XX" C.$$ CL.consume
+        it "yields nothing when given nothing" $
+            splitOnXX [] == [[]]
+        it "yields nothing given only empty text" $
+            splitOnXX [""] == [[]]
+        it "handles separators on a chunk boundary" $
+            splitOnXX ["aX", "Xb"] == [["a", "b"]]
+        it "works across split chunks" $
+            splitOnXX ["abc", "dXXef"] == [["abcd", "ef"]]
+        it "works with multiple splits in an item" $
+            splitOnXX ["abXXcdXXe"] == [["ab", "cd", "e"]]
+        it "works with ending on a split" $
+            splitOnXX ["abXX"] == [["ab"]]
+        it "works with ending a middle item on a split" $
+            splitOnXX ["abXX", "cdXXe"] == [["ab", "cd", "e"]]
+        it "works with empty text" $
+            splitOnXX ["ab", "", "cd"] == [["abcd"]]
+        -- NB: Data.Text.splitOn would give three empty strings here. Two
+        -- matches how `Data.Conduit.Text.lines` / `Data.String.lines` behave.
+        it "works with just separators" $
+            splitOnXX ["XXXX"] == [["", ""]]
+        prop "works the same no matter how things are chunked" $
+            \input separator chunkSize -> do
+                let whole = T.pack input
+                    sep = T.pack (getNonEmpty separator)
+                    pieces = T.chunksOf (getPositive chunkSize) whole
+                unchunked <- CL.sourceList [whole] C.$= CT.split sep C.$$ CL.consume
+                rechunked <- CL.sourceList pieces C.$= CT.split sep C.$$ CL.consume
+                unchunked `shouldBe` rechunked
+
     describe "text lines bounded" $ do
         it "yields nothing given nothing" $
             (CL.sourceList [] C.$= CT.linesBounded 80 C.$$ CL.consume) ==