module Index where
import Data.HashMap.Strict (Map)
import qualified Data.HashMap.Strict as Map
import Data.IntMap.Strict (IntMap)
import qualified Data.IntMap.Strict as IntMap
import Data.IntSet (IntSet)
import qualified Data.IntSet as IntSet
import Data.List (foldl')
import Data.Maybe (catMaybes, fromMaybe)
import Data.Text (Text)
import qualified Analysis as Analysis
import Document (Document)
import qualified Document as Document
newtype TermDoc = TermDoc { unTermDoc :: Map Text IntSet }
deriving Show
instance Semigroup TermDoc where
(TermDoc mA) <> (TermDoc mB) =
TermDoc (Map.unionWith IntSet.union mA mB)
instance Monoid TermDoc where
mempty = TermDoc mempty
addTerm :: Int -> Text -> TermDoc -> TermDoc
addTerm doc term td =
TermDoc
. Map.insertWith IntSet.union term (IntSet.singleton doc)
$ unTermDoc td
data Index = Index
{ termdoc :: !TermDoc
, docs :: !(IntMap Document)
}
deriving Show
instance Semigroup Index where
(Index tdA dsA) <> (Index tdB dsB) = Index
{ termdoc = tdA <> tdB
, docs = dsA <> dsB
}
instance Monoid Index where
mempty = Index mempty mempty
fromDocument :: Document -> Index
fromDocument doc = Index
{ docs =
IntMap.singleton (Document.id doc) doc
, termdoc =
foldl' (flip (addTerm (Document.id doc)))
mempty
(Analysis.analyse (Document.fullText doc))
}
{-
Built in main with
index <- runConduitRes $
documents
.| takeC 100000
.| mapC Index.fromDocument
.| foldlC (<>) mempty
-}