Source code for docria.collection

# -*- coding: utf-8 -*-
#
# Copyright 2021 Marcus Klang (marcus.klang@cs.lth.se)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""I/O module, read/write collections of documents"""
import os
from io import BytesIO, RawIOBase, SEEK_SET, SEEK_CUR, SEEK_END
from msgpack import Unpacker, Packer
from typing import Optional, Callable, Dict, Union, List, Tuple, Iterator, TYPE_CHECKING
import zlib
import bz2
import lzma
import struct
import importlib.util
from docria.model import Document
from docria.codec import MsgpackDocument
import tarfile
import time


def _module_available(name):
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        return False


class _BoundaryReader(RawIOBase):
    """Note: Seek is not ready for prime time yet, unresolved bugs."""
    def __init__(self, inputio):
        super().__init__()

        self.inputio = inputio  # type: RawIOBase
        self.boundary = self.inputio.read(1)[0]
        if self.boundary < 12 or self.boundary > 30:
            raise IOError("Incorrect boundary value: %d" % self.boundary)

        self.offset = 0

    def close(self):
        return self.inputio.close()

    def seekable(self):
        return self.inputio.seekable()

    def _to_absolute(self, offset):
        boundary_val = 1 << self.boundary
        if offset < boundary_val - 1:
            return offset + 1
        else:
            upper = offset - boundary_val + 1
            return (upper // (boundary_val-4))*4 + boundary_val + 4 + upper

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        if whence != SEEK_SET:
            raise NotImplementedError()

        self.offset = offset
        abs_start_position = self._to_absolute(offset)
        pos = self.inputio.seek(abs_start_position, SEEK_SET)
        assert pos == abs_start_position
        return offset

    def read(self, n=-1):
        abs_start_position = ((self.offset + 1) >> self.boundary)*4+(self.offset + 1)
        if n == -1:
            alldata = self.inputio.read(n)
            abs_stop_position = abs_start_position + len(alldata)
        else:
            abs_stop_position = ((self.offset + n + 1) >> self.boundary)*4+(self.offset + n + 1)
            alldata = self.inputio.read(abs_stop_position-abs_start_position)
            abs_stop_position = abs_start_position + len(alldata)

        if len(alldata) == 0:
            return alldata

        if (abs_start_position >> self.boundary) << self.boundary == abs_start_position:
            abs_start_position += 4

        num_boundaries = (abs_stop_position >> self.boundary) - (abs_start_position >> self.boundary)
        real_length = abs_stop_position-abs_start_position-num_boundaries*4

        if (abs_stop_position >> self.boundary) << self.boundary == abs_stop_position:
            real_length -= 4

        output = bytearray(real_length)

        output_position = 0
        rel_position = 0
        current_position = abs_start_position

        while current_position < abs_stop_position:
            max_read = min(((current_position >> self.boundary) + 1) << self.boundary, abs_stop_position)-current_position

            output[output_position:output_position+max_read] = alldata[rel_position:rel_position+max_read]
            rel_position += 4 + max_read
            current_position += max_read + 4
            output_position += max_read

        assert output_position == real_length
        self.offset += real_length

        return bytes(output)

    def tell(self):
        return self.inputio.tell()-(self.inputio.tell() >> self.boundary)*4-1

    def readable(self):
        return True

    def writable(self):
        return False


class _BoundaryWriter(RawIOBase):
    def __init__(self, outputio, boundary=20, **kwargs):
        super().__init__()
        self.outputio = outputio  # type: RawIOBase
        if 12 <= boundary <= 30:
            self.boundary = boundary
        else:
            raise ValueError("Got invalid boundary value: %d, "
                             "valid value is 12 to 30 which represents 2^12 (4 kiB) to 2^30 (1 GiB)" % boundary)

        self.outputio.write(bytes([boundary]))

        self.written = 1
        self.lastsplit = 1
        self.seg = 0

    def writable(self):
        return True

    def readable(self, *args, **kwargs):
        return False

    def seekable(self, *args, **kwargs):
        return False

    def split(self):
        self.lastsplit = self.written

    def _write_boundary(self):
        assert (self.written >> self.boundary) << self.boundary == self.written
        delta = self.lastsplit - self.written

        deltapos = delta
        if delta <= -0x80000000:
            deltapos = -0x80000000

        self.outputio.write(struct.pack(">i", deltapos))

        self.written += 4
        self.seg += 1

    def write(self, data):
        pos = 0
        left = len(data)

        maxwrite = min(((self.seg + 1) << self.boundary) - self.written, left)
        while left > 0:
            self.outputio.write(data[pos:pos+maxwrite])
            pos += maxwrite
            self.written += maxwrite
            left -= maxwrite

            if left > 0:
                self._write_boundary()
                maxwrite = min(((self.seg + 1) << self.boundary) - self.written, left)

    def close(self):
        self.outputio.close()
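
# Layout produced by _BoundaryWriter (and consumed by _BoundaryReader), as a
# reading of the code above rather than an authoritative spec: the first byte
# stores the boundary exponent b (12..30); thereafter, at every multiple of
# 2^b bytes in the output, a 4-byte big-endian signed delta pointing back to
# the last split() position is injected, allowing a reader to find the start
# of the enclosing block from any boundary. Minimal round-trip sketch through
# an in-memory buffer:
#
#   buf = BytesIO()
#   w = _BoundaryWriter(buf, boundary=12)
#   w.write(b"hello world")
#   buf.seek(0)
#   r = _BoundaryReader(buf)
#   assert r.read() == b"hello world"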


class MsgpackDocumentBlock:
    """
    Represents a block of MessagePack docria documents

    .. automethod:: __iter__
    .. automethod:: __next__
    """

    def __init__(self, position: int, rawbuffer: bytes):
        self._dataread = 0
        self._data = BytesIO(rawbuffer)
        self._unpacker = Unpacker(self._data, raw=False)
        self._position = position

    @property
    def position(self) -> int:
        """Get the original byte position"""
        return self._position

    def tell(self) -> int:
        """Get the current byte position within this block"""
        return self._unpacker.tell() + self._dataread

    def seek(self, position) -> int:
        pos = self._data.seek(position)
        self._dataread = position
        self._unpacker = Unpacker(self._data, raw=False)
        return pos

    def documents(self) -> List[Tuple[int, MsgpackDocument]]:
        """Return all documents as a list of tuples (position, MessagePack Docria document)"""
        docs = []
        last = 0
        self.seek(0)
        for doc in self:
            docs.append((last, doc))
            last = self.tell()
        return docs

    def __iter__(self):
        """:returns: self"""
        return self

    def __next__(self):
        """:returns: MsgpackDocument with the encoded document"""
        from docria.codec import MsgpackDocument
        blockpos = self.tell()
        data = next(self._unpacker, None)
        if data is None:
            raise StopIteration()
        else:
            return MsgpackDocument(data, ref=(self._position, blockpos))

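# Example sketch: iterating the documents of a single block obtained from a
# MsgpackDocumentReader (defined below); "corpus.docria" is a hypothetical path.
#
#   reader = MsgpackDocumentReader("corpus.docria")
#   block = reader.readblock()
#   if block is not None:
#       for msgpack_doc in block:  # yields MsgpackDocument instances
#           print(msgpack_doc.properties())
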
class CompressionCodec:
    def __init__(self, name, compress: Callable[[bytes], bytes], decompress: Callable[[bytes], bytes]):
        self.name = name
        self.compress = compress
        self.decompress = decompress


_Name2Codec = {}


def register_codec(name, compress: Callable[[bytes], bytes], decompress: Callable[[bytes], bytes],
                   codec_name=None):
    if name in _Name2Codec:
        raise ValueError(f"Codec {name} already registered")

    _Name2Codec[name] = CompressionCodec(codec_name if codec_name is not None else name,
                                         compress, decompress)


def unregister_codec(name):
    del _Name2Codec[name]


def get_codec(name, no_except=False) -> Union[CompressionCodec, None]:
    if not no_except:
        if name not in _Name2Codec:
            raise NotImplementedError(f"Codec {name} is not implemented.")

    return _Name2Codec.get(name)


register_codec("none", lambda x: x, lambda x: x)
register_codec("zip", zlib.compress, zlib.decompress)
register_codec("bzip2", bz2.compress, bz2.decompress)
register_codec("zipsq",
               lambda x: zlib.compress(zlib.compress(x, level=zlib.Z_BEST_COMPRESSION),
                                       level=zlib.Z_BEST_COMPRESSION),
               lambda x: zlib.decompress(zlib.decompress(x)))
register_codec("lzma", lzma.compress, lzma.decompress)

if not TYPE_CHECKING and _module_available("lz4.frame"):
    try:
        import lz4.frame

        register_codec("lz4",
                       lambda x: lz4.frame.compress(x,
                                                    block_size=lz4.frame.BLOCKSIZE_MAX4MB,
                                                    compression_level=lz4.frame.COMPRESSIONLEVEL_MAX),
                       lz4.frame.decompress)
    except ModuleNotFoundError:
        pass

if not TYPE_CHECKING and _module_available("zstd"):
    try:
        import zstd

        register_codec("zstd", lambda x: zstd.compress(x), zstd.decompress)
        register_codec("zstd:high", lambda x: zstd.compress(x, 9), zstd.decompress, codec_name="zstd")
        register_codec("zstd:ultra", lambda x: zstd.compress(x, 22), zstd.decompress, codec_name="zstd")
    except ModuleNotFoundError:
        pass

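# Codecs are looked up by name when writing and reading collections. A custom
# codec can be registered at runtime; the sketch below assumes the third-party
# python-snappy package ("snappy" is not registered by this module).
#
#   import snappy
#   register_codec("snappy", snappy.compress, snappy.decompress)
#   writer = MsgpackDocumentWriter("out.docria", codec=get_codec("snappy"))
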
class MsgpackDocumentReader:
    """Reader for the blocked MessagePack document file format"""

    def __init__(self, inputio: Union[RawIOBase, str]):
        """
        Construct a document reader

        :param inputio: path to a docria file for reading or
                        a file object (e.g. the object returned by open)
        """
        if isinstance(inputio, str):
            inputio = open(inputio, mode="rb")

        self.inputio = inputio
        self.dataread = 4

        header = self.inputio.read(4)
        if header != b"Dmf1":
            raise IOError("Header does not match expected format 'Dmf1', found: %s" % header.decode("latin1"))

        self.unpacker = Unpacker(self.inputio, raw=False)

        codecname = next(self.unpacker)
        self.codec = get_codec(codecname).decompress
        self.num_doc_per_block = next(self.unpacker)
        self.advanced = next(self.unpacker)
        if self.advanced:
            raise NotImplementedError("Advanced mode not implemented.")

        self.block = None  # type: MsgpackDocumentBlock
        self._lastblockpos = inputio.tell()

    def __iter__(self) -> Iterator[MsgpackDocument]:
        return self

    def __enter__(self) -> "MsgpackDocumentReader":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.inputio.close()

    def get(self, ref):
        """
        Returns a specific document at position (file position, block position)

        :param ref: tuple of (raw file position, uncompressed block position)
        :return: MessagePack document instance

        :note: This method assumes and requires that the underlying I/O supports seeking.
        """
        if self.block is None or self.block.position != ref[0]:
            self.seek(ref[0])
            self.block = self.readblock()

        self.block.seek(ref[1])
        return next(self.block)

    def seek(self, position):
        """
        Seek to a block position

        :param position: raw file position

        :note: This method assumes and requires that the underlying I/O supports seeking.
        """
        self.block = None
        self.inputio.seek(position, SEEK_SET)
        self.unpacker = Unpacker(self.inputio, raw=False)
        self.dataread = position

    def blocks(self):
        """Get an iterator over all document blocks"""
        while True:
            bl = self.readblock()
            if bl is None:
                return
            else:
                yield bl

    def readblock(self) -> Optional[MsgpackDocumentBlock]:
        """Read a single block if possible"""
        self._lastblockpos = self.unpacker.tell() + self.dataread
        data = next(self.unpacker, None)
        if data is None:
            return None
        else:
            buf = self.codec(data)
            return MsgpackDocumentBlock(self._lastblockpos, buf)

    def __next__(self) -> MsgpackDocument:
        if self.block is not None:
            start = self.block.tell()
            doc = next(self.block._unpacker, None)
            if doc is None:
                self.block = None
            else:
                return MsgpackDocument(doc, ref=(self._lastblockpos, start))

        while self.block is None:
            datablock = self.readblock()
            if datablock is None:
                raise StopIteration()

            self.block = datablock
            start = self.block.tell()
            doc = next(self.block._unpacker, None)
            if doc is None:
                self.block = None
            else:
                return MsgpackDocument(doc, ref=(self._lastblockpos, start))

    def close(self):
        self.inputio.close()

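# Example sketch: sequential reading, saving each document's ref for later
# random access via get(); "corpus.docria" is a hypothetical path, and the
# second part requires seekable I/O.
#
#   refs = []
#   with MsgpackDocumentReader("corpus.docria") as reader:
#       for msgpack_doc in reader:
#           refs.append(msgpack_doc.ref)
#
#   reader = MsgpackDocumentReader("corpus.docria")
#   first_doc = reader.get(refs[0]).document()  # decode to a Document
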
class DocumentReader:
    """Utility reader, returns Docria documents."""

    def __init__(self, inputreader):
        self.inputreader = inputreader

    def __enter__(self) -> "DocumentReader":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.inputreader.close()

    def __iter__(self):
        return self

    def __next__(self):
        doc = next(self.inputreader, None)
        if doc is None:
            raise StopIteration()

        return doc.document()

class MsgpackDocumentWriter:
    """Writer for the blocked MessagePack document file format"""

    def __init__(self, outputio: Union[RawIOBase, str], num_docs_per_block=128,
                 codec=get_codec("zip"), mode="xb", **kwargs):
        """
        Construct a document writer.

        If a string path is provided, mode "xb" is used by default, meaning the
        call will fail if the file already exists.

        :param outputio: path to a new docria file to write to, or a file object
        :param num_docs_per_block: the number of documents to cache before compressing
                                   the entire block and writing it to the underlying storage
        :param codec: the compression codec to use for blocks
        :param mode: if outputio is a string path, the file mode to use, by default "xb"
        """
        if isinstance(outputio, str):
            outputio = open(outputio, mode=mode)

        self.outputio = outputio
        self.packer = Packer(use_bin_type=True)

        self.outputio.write(b"Dmf1")
        self.outputio.write(self.packer.pack(codec.name))
        self.outputio.write(self.packer.pack(num_docs_per_block))
        self.outputio.write(self.packer.pack(False))

        if isinstance(self.outputio, _BoundaryWriter):
            self.outputio.split()

        self.currentblock = BytesIO()
        self.current_block_count = 0
        self.codec_name = codec.name
        self.codec = codec.compress  # type: Callable[[bytes], bytes]
        self.num_docs_per_block = num_docs_per_block

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def write(self, doc: Union[Document, MsgpackDocument], **kwargs):
        """
        Write a docria document

        :param doc: accepts an unencoded Document or a MsgpackDocument for fast writing
        :param kwargs: options to pass to :meth:`docria.codec.MsgpackCodec.encode`
        """
        from docria.codec import MsgpackCodec

        if isinstance(doc, Document):
            data = MsgpackCodec.encode(doc, **kwargs)
        elif isinstance(doc, MsgpackDocument):
            data = doc.rawdata.getvalue()
        else:
            raise ValueError("Got unsupported doc, only Document and MsgpackDocument allowed")

        self.currentblock.write(self.packer.pack(data))
        self.current_block_count += 1

        if self.current_block_count == self.num_docs_per_block:
            self.flush()

    def flush(self):
        """
        Flush data to the underlying storage.

        :note: Forces the currently cached documents to be compressed and written to
               disk, which might result in blocks with fewer than the specified number
               of documents per block.
        """
        if self.current_block_count > 0:
            self.outputio.write(self.packer.pack(self.codec(self.currentblock.getvalue())))
            if isinstance(self.outputio, _BoundaryWriter):
                self.outputio.split()

            self.currentblock = BytesIO()
            self.current_block_count = 0

    def close(self):
        """Flush data and close the underlying storage"""
        self.flush()
        self.outputio.close()

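# Example sketch: writing a collection block-wise; "out.docria" is a
# hypothetical path and docs is assumed to be an iterable of
# docria.model.Document instances.
#
#   with MsgpackDocumentWriter("out.docria", num_docs_per_block=128,
#                              codec=get_codec("zip")) as writer:
#       for doc in docs:
#           writer.write(doc)
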
class TarMsgpackReader:
    """Reader for the tar-based sequential MessagePack format."""

    def __init__(self, inputpath, mode="r|gz", **kwargs):
        """
        TarMsgpackReader constructor

        :param inputpath: filepath to the tarball
        :param mode: the tarball reading mode, see :meth:`tarfile.open`;
                     can be used to select bz2 or lzma compression modes.
        """
        self.tarreader = tarfile.open(inputpath, mode=mode)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.tarreader.close()

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            ti = self.tarreader.next()  # type: tarfile.TarInfo
            if ti is None:
                raise StopIteration()

            if ti.isfile():
                obj = self.tarreader.extractfile(ti)
                return MsgpackDocument(obj.read())

    def close(self):
        self.tarreader.close()

class TarMsgpackWriter:
    """Writer for the tar-based sequential MessagePack format."""

    def __init__(self, outputpath, docformat="doc%05d.msgpack", rootdir=None, mode="w|gz", **kwargs):
        """
        TarMsgpackWriter constructor

        :param outputpath: filepath to the tarball
        :param docformat: naming convention of files in the tarball, must contain
                          a single integer field in old-style string formatting.
        :param rootdir: set to a string if a root directory within the tarfile
                        should be used.
        :param mode: the tarball writing mode, see :meth:`tarfile.open`;
                     can be used to select bz2 or lzma compression modes.
        """
        self.tarwriter = tarfile.open(outputpath, mode=mode, **kwargs)
        self.rootdir = rootdir
        self.docformat = docformat

        if rootdir is not None:
            ti = tarfile.TarInfo(rootdir)
            ti.type = tarfile.DIRTYPE
            ti.mtime = time.time()
            self.tarwriter.addfile(ti)

        self.i = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def write(self, doc: Union[Document, MsgpackDocument]):
        """
        Write a document

        :param doc: accepts an unencoded Document, or an encoded MsgpackDocument
                    for fast conversion.
        """
        from docria.codec import MsgpackCodec

        if self.rootdir is not None:
            ti = tarfile.TarInfo(os.path.join(self.rootdir, self.docformat % self.i))
        else:
            ti = tarfile.TarInfo(self.docformat % self.i)

        ti.mtime = time.time()

        if isinstance(doc, Document):
            data = MsgpackCodec.encode(doc)
        elif isinstance(doc, MsgpackDocument):
            data = doc.rawdata.getvalue()
        else:
            raise ValueError("Got unsupported doc, only Document and MsgpackDocument allowed")

        ti.size = len(data)
        self.i += 1

        self.tarwriter.addfile(ti, fileobj=BytesIO(data))

    def close(self):
        self.tarwriter.close()

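# Example sketch: round-tripping documents through the tar-based format;
# "corpus.tar.gz" is a hypothetical path and doc an existing Document.
#
#   with TarMsgpackWriter("corpus.tar.gz") as writer:
#       writer.write(doc)
#
#   with TarMsgpackReader("corpus.tar.gz") as reader:
#       for msgpack_doc in reader:
#           doc = msgpack_doc.document()  # decode to a Document
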
class DocumentIO:
    """
    .. deprecated::
       Use concrete variants instead, such as MsgpackDocumentIO
    """

    @staticmethod
    def write(filepath, **kwargs) -> MsgpackDocumentWriter:
        return MsgpackDocumentWriter(open(filepath, "wb"), **kwargs)

    @staticmethod
    def writefile(filelike: RawIOBase, **kwargs) -> MsgpackDocumentWriter:
        return MsgpackDocumentWriter(filelike, **kwargs)

    @staticmethod
    def read(filepath, **kwargs) -> DocumentReader:
        return DocumentIO.readfile(open(filepath, "rb"), **kwargs)

    @staticmethod
    def readfile(filelike: RawIOBase, **kwargs) -> DocumentReader:
        return DocumentReader(MsgpackDocumentReader(filelike))

class MsgpackDocumentIO:
    """MessagePack Document I/O class"""

    @staticmethod
    def read(filepath, **kwargs) -> MsgpackDocumentReader:
        """
        Read a document collection

        :param filepath: the source filepath
        :param kwargs: arguments for reading
        :return: reader for the collection
        """
        return MsgpackDocumentIO.readfile(open(filepath, "rb"))

    @staticmethod
    def readfile(filelike, **kwargs) -> MsgpackDocumentReader:
        """
        Read a document collection from a file-like object

        :param filelike: the file-like reader
        :param kwargs: arguments for reading
        :return: reader for the collection
        """
        return MsgpackDocumentReader(filelike)

class DocumentFileIndex:
    """In-memory index of a single docria file"""

    def __init__(self, filepath: str,
                 properties: Dict[str, Dict[any, List[int]]],
                 docrefs: List[Tuple[int, int]]):
        """
        Constructor of DocumentFileIndex

        :param filepath: path to the MessagePack document file
        :param properties: the property index, a dict mapping property name to
                           a dict of property value to document ids
        :param docrefs: list of document references
        """
        self.filepath = filepath
        self.properties = properties
        self.docrefs = docrefs

    @staticmethod
    def build(source_filepath, *properties, **kwargs):
        reader = MsgpackDocumentIO.read(source_filepath, **kwargs)
        property2docis = {prop: {} for prop in properties}
        docs = []

        for docid, doc in enumerate(doc for block in reader.blocks() for doc in block):
            docs.append(doc.ref)
            props = doc.properties()
            for prop in properties:
                if prop in props:
                    property2docis[prop].setdefault(props[prop], []).append(docid)

        reader.close()
        return DocumentFileIndex(source_filepath, property2docis, docs)

    def search(self, conds, lazy=False):
        if len(conds) == 0:
            return

        all_hits = []
        for k, v in dict(conds).items():
            all_hits.append(set(self.properties.get(k, {}).get(v, [])))

        results = all_hits[0]
        for s in all_hits[1:]:
            results.intersection_update(s)

        from docria.codec import MsgpackCodec

        if len(results) > 0:
            reader = MsgpackDocumentIO.read(self.filepath)
            lastblk = None

            # Optimized reading if multiple hits exist sequentially within a block
            for docid in sorted(results):
                ref = self.docrefs[docid]
                try:
                    if lastblk is None:
                        reader.seek(ref[0])
                        lastblk = reader.readblock()
                    elif lastblk.position != ref[0]:
                        reader.seek(ref[0])
                        lastblk = reader.readblock()

                    lastblk.seek(ref[1])
                    if lazy:
                        yield next(lastblk)
                    else:
                        yield MsgpackCodec.decode(next(lastblk._unpacker))
                except Exception as e:
                    raise IOError("Failed to read document in %s "
                                  "for ref %d, %d" % (self.filepath, ref[0], ref[1])) from e

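# Example sketch: building a single-file property index and querying it;
# "corpus.docria" and the "id" property are hypothetical.
#
#   indx = DocumentFileIndex.build("corpus.docria", "id")
#   for doc in indx.search({"id": "doc-42"}):
#       ...  # decoded Documents, or MsgpackDocuments when lazy=True
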
class DocumentIndex:
    """Multi-file in-memory index"""

    def __init__(self, basepath="."):
        self.basepath = os.path.abspath(basepath)
        self.index = {}  # type: Dict[str, DocumentFileIndex]

    def add(self, index: "DocumentFileIndex"):
        index.filepath = os.path.relpath(index.filepath, self.basepath)
        self.index[index.filepath] = index

    def search(self, conds, lazy=False):
        for indx in self.index.values():
            for doc in indx.search(conds, lazy=lazy):
                yield doc

    def save(self, path):
        """Save the index as a pickle file"""
        import pickle
        with open(path, "wb") as fout:
            pickle.dump(self, fout, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def load(path):
        """Load a pickled index"""
        import pickle
        with open(path, "rb") as fin:
            return pickle.load(fin)

def _build_file_index(args):
    path = args["path"]
    props = args["props"]
    return DocumentFileIndex.build(path, *props)

def build_msgpack_fileindex(path, *props) -> "DocumentFileIndex":
    """
    Construct a document index

    :param path: path to a file which can be read by :class:`~docria.storage.MsgpackDocumentReader`
    :param props: the properties to index
    :return: the built index
    """
    return DocumentFileIndex.build(path, *props)

def build_msgpack_directory_fileindex(path, *props, basepath=".", num_workers=None) -> "DocumentIndex":
    """
    Construct a document index spanning multiple docria files.

    :param path: path to the directory containing docria files
    :param props: the properties to index
    :param basepath: the relative path to use when saving filepath locations
    :param num_workers: the number of processes to spawn for multicore processing of
                        files, default is the number of cores available as given by
                        :meth:`multiprocessing.cpu_count`.
    :return: populated DocumentIndex

    :note: basepath can be used to create an index which only contains relative
           references and thus can be shipped together with the document collection.
    """
    from multiprocessing import Pool, cpu_count
    import re

    namefilter = re.compile(r"^[^.]+?\.docria(\.(xz|bz2|z|lz4))?$")
    docria_files = [os.path.join(path, fpath)
                    for fpath in os.listdir(path)
                    if namefilter.fullmatch(fpath) is not None]

    master_indx = DocumentIndex(basepath=basepath)
    proplist = list(props)
    num_workers = cpu_count() if num_workers is None else num_workers

    with Pool(processes=num_workers) as p:
        if _module_available("tqdm"):
            from tqdm import tqdm
            with tqdm("Building index", total=len(docria_files)) as pbar:
                for indxitem in p.imap_unordered(_build_file_index,
                                                 [{"path": path, "props": proplist} for path in docria_files]):
                    master_indx.add(indxitem)
                    pbar.update(1)
        else:
            for indxitem in p.imap_unordered(_build_file_index,
                                             [{"path": path, "props": proplist} for path in docria_files]):
                master_indx.add(indxitem)

    return master_indx

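# Example sketch: indexing a directory of docria files in parallel and
# persisting the index next to the data; the paths and the "id" property are
# hypothetical.
#
#   indx = build_msgpack_directory_fileindex("corpus/", "id", basepath="corpus/")
#   indx.save("corpus/index.pkl")
#   indx = DocumentIndex.load("corpus/index.pkl")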