module Index where import Data.HashMap.Strict (Map) import qualified Data.HashMap.Strict as Map import Data.IntMap.Strict (IntMap) import qualified Data.IntMap.Strict as IntMap import Data.IntSet (IntSet) import qualified Data.IntSet as IntSet import Data.List (foldl') import Data.Maybe (catMaybes, fromMaybe) import Data.Text (Text) import qualified Analysis as Analysis import Document (Document) import qualified Document as Document newtype TermDoc = TermDoc { unTermDoc :: Map Text IntSet } deriving Show instance Semigroup TermDoc where (TermDoc mA) <> (TermDoc mB) = TermDoc (Map.unionWith IntSet.union mA mB) instance Monoid TermDoc where mempty = TermDoc mempty addTerm :: Int -> Text -> TermDoc -> TermDoc addTerm doc term td = TermDoc . Map.insertWith IntSet.union term (IntSet.singleton doc) $ unTermDoc td data Index = Index { termdoc :: !TermDoc , docs :: !(IntMap Document) } deriving Show instance Semigroup Index where (Index tdA dsA) <> (Index tdB dsB) = Index { termdoc = tdA <> tdB , docs = dsA <> dsB } instance Monoid Index where mempty = Index mempty mempty fromDocument :: Document -> Index fromDocument doc = Index { docs = IntMap.singleton (Document.id doc) doc , termdoc = foldl' (flip (addTerm (Document.id doc))) mempty (Analysis.analyse (Document.fullText doc)) } {- Built in main with index <- runConduitRes $ documents .| takeC 100000 .| mapC Index.fromDocument .| foldlC (<>) mempty -}