Models

snoop.data.models #

Django model definitions file.

Also see snoop.data.collections for details on how models are bound to the different databases.

Classes#

Blob #

Database model for storing binary objects, their hashes, and mime types.

Every file that gets ingested by Hoover is cloned as a Blob and referenced in this table. Since the primary key is the hash of the data, all documents are de-duplicated.

Intermediary results (like converted files, extracted files, JSON responses from other libraries and services, and the Digests, also in JSON) are also stored using this system, with no namespace separation. This means all our intermediary tasks tend to be de-duplicated too.
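
For illustration, a minimal sketch of this de-duplication behavior, assuming an active collection context (see snoop.data.collections):

from snoop.data.models import Blob

# Identical content hashes to the same SHA3-256 primary key,
# so the second call returns the existing row instead of creating a new one.
first = Blob.create_from_bytes(b'hello world')
second = Blob.create_from_bytes(b'hello world')
assert first.pk == second.pk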

Attributes#
collection_source_key #

If this is set, we store and retrieve the file using this key from the collections S3 instead of the default blobs S3.

content_type property readonly #

Returns a web-friendly content type string (for the HTTP header).

date_created #

Auto-managed timestamp.

date_modified #

Auto-managed timestamp.

magic #

mime description given by libmagic (man 1 file).

md5 #

hash of content

mime_encoding #

mime encoding given by libmagic, for text files.

mime_type #

mime type given by libmagic.

sha1 #

hash of content

sha256 #

hash of content

sha3_256 #

hash of content (primary key)

size #

size of content, bytes.

Methods#
__str__(self) special #

The string representation for a Blob is just its PK hash.

Source code in snoop/data/models.py
def __str__(self):
    """The string representation for a Blob is just its PK hash.
    """
    the_str = truncatechars(self.pk, 10)
    return f'Blob({the_str})'
create(cls, fs_path = None) classmethod #

Context manager used for creating Blobs.

Parameters:

- fs_path (default: None): optional filesystem path to the file, used to get a more accurate reading of the mime type. If absent, the mime type will only be guessed from the data, without the help of the extension. Libmagic can't properly guess some vintage Microsoft formats without the extension present.

Yields:

- snoop.data.models.BlobWriter: use .write(byte_string) on the returned object until finished. The final result can be found at .blob on the same object, after exiting this context manager's context.

Source code in snoop/data/models.py
@classmethod
@contextmanager
def create(cls, fs_path=None):
    """Context manager used for creating Blobs.

    Args:
        fs_path: optional filesystem path to file to get a more accurate
            reading for the mime type. If absent, the mime type will only
            be guessed from the data, without the help of the extension.
            Libmagic can't properly guess some vintage Microsoft formats
            without the extensions present.

    Yields:
        [snoop.data.models.BlobWriter][] -- Use `.write(byte_string)` on the returned object until
        finished. The final result can be found at `.blob` on the same object, after exiting this
        contextmanager's context.
    """

    fields = {}
    if fs_path:
        m = Magic(fs_path)
        fields = m.fields
    with tempfile.NamedTemporaryFile(prefix='new-blob-', delete=False) as f:
        writer = BlobWriter(f)
        yield writer

    fields.update(writer.finish())
    pk = fields.pop('sha3_256')

    temp_blob_path = Path(f.name)
    temp_blob_path.chmod(0o444)

    if not fs_path:
        m = Magic(temp_blob_path)
        fields.update(m.fields)

    settings.BLOBS_S3.fput_object(
        collections.current().name,
        blob_repo_path(pk),
        temp_blob_path,
    )

    (blob, _) = cls.objects.get_or_create(pk=pk, defaults=fields)
    writer.blob = blob

    os.remove(temp_blob_path)
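
A hedged usage sketch for this context manager, again assuming an active collection context:

from snoop.data.models import Blob

with Blob.create() as writer:
    writer.write(b'first chunk')
    writer.write(b'second chunk')

# The finished Blob only appears on the writer after the context exits.
blob = writer.blob
print(blob.pk, blob.size, blob.mime_type)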
create_from_bytes(data) classmethod #

Create a Blob from a single byte string.

Useful when objects are in memory, for example when parsing email.

Parameters:

- data (required): the byte string to be stored.
Source code in snoop/data/models.py
@classmethod
def create_from_bytes(cls, data):
    """Create a Blob from a single byte string.

    Useful when objects are in memory, for example when parsing email.

    Args:
        data: the byte string to be stored
    """
    sha3_256 = hashlib.sha3_256()
    sha3_256.update(data)

    try:
        b = Blob.objects.get(pk=sha3_256.hexdigest())
        return b

    except ObjectDoesNotExist:
        with cls.create() as writer:
            writer.write(data)
        return writer.blob
create_from_file(path, collection_source_key = None) classmethod #

Create a Blob from a file on disk.

Since we know it has a stable path on disk, we take the luxury of reading it twice. We read it once to compute only the primary key hash, then close it, and if this is a new file, we reopen it and read the data.

Parameters:

- path (required): string or Path to read from.
- collection_source_key (default: None): if set, use the collection source bucket as storage.
Source code in snoop/data/models.py
@classmethod
def create_from_file(cls, path, collection_source_key=None):
    """Create a Blob from a file on disk.

    Since we know it has a stable path on disk, we take the luxury of
    reading it **twice**. We read it once to compute only the primary key
    hash, then close it, and if this is a new file, we reopen it and read
    the data.

    Args:
        path: string or Path to read from.
        collection_source_key: if set, will use the collection source bucket as storage.
    """
    path = Path(path).resolve().absolute()
    writer = BlobWriter()
    with open(path, 'rb') as f:
        for block in chunks(f):
            writer.write(block)
    fields = writer.finish()
    pk = fields.pop('sha3_256')

    try:
        b = Blob.objects.get(pk=pk)
        if collection_source_key and not b.collection_source_key:
            # delete this from minio and override/save new key
            try:
                settings.BLOBS_S3.remove_object(collections.current().name, blob_repo_path(b.pk))
                logger.info('successfully deleted object from s3.')
            except Exception as e:
                logger.exception(e)
                logger.error('failed to delete object from s3.')

            b.collection_source_key = collection_source_key
            b.save()
            return b

        # ensure the S3 object still exists by checking it
        try:
            stat = settings.BLOBS_S3.stat_object(
                collections.current().name,
                blob_repo_path(pk),
            )
            assert stat is not None, 'empty stat'
        except Exception as e:
            logger.warning('error getting stat (%s); re-uploading blob %s...', str(e), pk)
            settings.BLOBS_S3.fput_object(
                collections.current().name,
                blob_repo_path(pk),
                path,
            )
        return b

    except ObjectDoesNotExist:
        if collection_source_key:
            m = Magic(path)
            fields.update(m.fields)
            fields['collection_source_key'] = collection_source_key
            (blob, _) = cls.objects.get_or_create(pk=pk, defaults=fields)
            return blob

        with cls.create(path) as writer:
            with open(path, 'rb') as f:
                for block in chunks(f):
                    writer.write(block)

        return writer.blob
create_json(data) classmethod #

Create a Blob containing JSON encoded data from the given Python dict object.

Source code in snoop/data/models.py
@classmethod
def create_json(cls, data):
    """Create a Blob containing JSON encoded data from the given Python dict object."""
    return cls.create_from_bytes(json.dumps(data, indent=1).encode('utf-8'))
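
A round-trip sketch combining create_json with read_json (documented below):

from snoop.data.models import Blob

blob = Blob.create_json({'answer': 42})
assert blob.read_json() == {'answer': 42}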
mount_path(self) #

Mount this blob under some temporary directory using s3fs-fuse / fuse-7z-ng and return its path.

Source code in snoop/data/models.py
@contextmanager
def mount_path(self):
    """Mount this blob under some temporary directory using s3fs-fuse / fuse-7z-ng and return its
    path."""

    if self.collection_source_key:
        with collections.current().mount_collections_root() as collection_root:
            key_str = self.collection_source_key.tobytes().decode('utf-8', errors='surrogateescape')
            yield os.path.join(collection_root,
                               key_str)

    else:
        with collections.current().mount_blobs_root() as blobs_root:
            key = blob_repo_path(self.pk)
            yield os.path.join(blobs_root, key)
open(self, need_seek = False, need_fileno = False) #

Open this Blob's data storage for reading. Mode is always 'rb'.

Parameters:

- need_seek (default: False): set if the returned file object must support f.seek(), for example with Python libraries. If this is the only flag set, this is achieved by using the smart_open library.
- need_fileno (default: False): set if the returned file object must support f.fileno(), for example with subprocess calls where the blob is given as standard input. If this is the only flag set, this is achieved by making a local FIFO pipe (with os.mkfifo) and pushing data into it from a forked process.

If both flags are set to True, we use mount_path() to get a FUSE filesystem containing the file, and return a file object obtained by opening that path.

Some programs don't accept any kind of input from stdin, such as 7z with most formats, or pdf2pdfocr.py, which just exits (probably because it needs to seek and re-open its input multiple times).

In that case, just use the mount_path contextmanager to get a POSIX filesystem path.

Source code in snoop/data/models.py
@contextmanager
def open(self, need_seek=False, need_fileno=False):
    """Open this Blob's data storage for reading. Mode is always 'rb'.

    Args:
        - need_seek: if the returned file object requires `f.seek()`, for example with Python libraries.
            If this is the only flag set, this is achieved by using the `smart_open` library.
        - need_fileno: if the returned file object requires `f.fileno()`, for example with `subprocess`
            calls where this is given as standard input. If this is the only flag set, this is achieved
            by making a local FIFO pipe (`os.mkfifo` and pushing data into that, from a forked process).

    If both arguments are set to `true`, then we use `mount_path()` to get a FUSE filesystem containing
    the files, and return the file object by opening that path.

    Some programs don't even accept any kind of input from stdin, such as `7z` with most formats, or
    `pdf2pdfocr.py`, which just exits (probably knowing it'll do multiple seek and multiple opens).

    In that case, just use the `mount_path` contextmanager to get a POSIX filesystem path.
    """
    # if (need_seek and need_fileno):
    if (need_fileno):
        with self.mount_path() as blob_path:
            yield open(blob_path, mode='rb')
            return

    if self.collection_source_key:
        bucket = collections.current().name
        key = self.collection_source_key.tobytes().decode('utf-8', errors='surrogateescape')
        smart_transport_params = settings.SNOOP_COLLECTIONS_SMART_OPEN_TRANSPORT_PARAMS
        minio_client = settings.COLLECTIONS_S3
    else:
        bucket = collections.current().name
        key = blob_repo_path(self.pk)
        smart_transport_params = settings.SNOOP_BLOBS_SMART_OPEN_TRANSPORT_PARAMS
        minio_client = settings.BLOBS_S3

    if need_seek:
        url = f's3u://{bucket}/{key}'
        yield smart_open(
            url,
            transport_params=smart_transport_params,
            mode='rb',
        )
        return

    # This works on subprocess calls, **but** if the process fails, they hang forever.
    # TODO We need to find an alternative to this, that works good when the process fails.
    # elif need_fileno:
    #     # Supply opened unix pipe. Pipe is written to by fork.
    #     with tempfile.TemporaryDirectory(prefix=f'blob-fifo-{self.pk}-') as d:
    #         fifo = os.path.join(d, 'fifo')
    #         os.mkfifo(fifo, 0o600)
    #         if os.fork() > 0:
    #             logger.info('parent process: call open on fifo')
    #             yield open(fifo, mode='rb')
    #         else:
    #             logger.info('child process: write into fifo')
    #             r = None
    #             try:
    #                 r = minio_client.get_object(bucket, key)
    #                 with open(fifo, mode='wb') as fwrite:
    #                     while (b := r.read(2 ** 20)):
    #                         fwrite.write(b)
    #             finally:
    #                 if r:
    #                     r.close()
    #                     r.release_conn()
    #                 logger.info('child process: exit')
    #                 os._exit(0)
    else:
        r = None
        try:
            r = minio_client.get_object(bucket, key)
            yield r
        finally:
            if r:
                r.close()
                r.release_conn()
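
Hedged usage sketches for the flag combinations described above, where blob stands for any Blob instance:

import subprocess
import zipfile

# No flags: a plain streaming read.
with blob.open() as f:
    data = f.read()

# need_seek: for Python libraries that call f.seek(), such as zipfile.
with blob.open(need_seek=True) as f:
    names = zipfile.ZipFile(f).namelist()

# need_fileno: for subprocess calls that read the blob from standard input.
with blob.open(need_fileno=True) as f:
    subprocess.run(['wc', '-c'], stdin=f)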
read_json(self) #

Load a JSON encoded binary into a python dict in memory.

Source code in snoop/data/models.py
def read_json(self):
    """Load a JSON encoded binary into a python dict in memory.
    """
    with self.open() as f:
        return json.load(f)
update_magic(self) #

Refreshes the mime type fields by running libmagic on the mounted blob.

Updates the database object if needed.

Source code in snoop/data/models.py
def update_magic(self):
    """Refreshes the mime type fields by running libmagic on the mounted blob.

    Updates the database object if needed.
    """
    with self.mount_path() as blob_path:
        m = Magic(Path(blob_path))
        fields = m.fields
    changed = False
    for k, v in fields.items():
        if v != getattr(self, k):
            setattr(self, k, v)
            changed = True
    if changed:
        self.save()

BlobWriter #

Compute binary blob size and hashes while also writing it in a file.

Methods#
__init__(self, file = None) special #

Constructor.

Parameters:

- file (default: None): opened file to write to; optional.
Source code in snoop/data/models.py
def __init__(self, file=None):
    """Constructor.

    Args:
        file: opened file, to write to, optional.
    """
    self.file = file
    self.hashes = {
        'md5': hashlib.md5(),
        'sha1': hashlib.sha1(),
        'sha3_256': hashlib.sha3_256(),
        'sha256': hashlib.sha256(),
    }
    self.size = 0
finish(self) #

Return accumulated counters for size and hashes.

Does not close file given to constructor.

Returns:

- dict: with the field 'size' and the various hashes.

Source code in snoop/data/models.py
def finish(self):
    """Return accumulated counters for size and hashes.

    Does not close file given to constructor.

    Returns:
        dict: with fields 'size' and the various hashes
    """
    fields = {
        name: hash.hexdigest()
        for name, hash in self.hashes.items()
    }
    fields['size'] = self.size
    return fields
write(self, chunk) #

Saves a byte string to file, while also updating size and hashes.

Parameters:

- chunk (required): byte string to save to file.
Source code in snoop/data/models.py
def write(self, chunk):
    """Saves a byte string to file, while also updating size and hashes.

    Args:
        chunk: byte string to save to file
    """
    for h in self.hashes.values():
        h.update(chunk)
    if self.file:
        self.file.write(chunk)
    self.size += len(chunk)
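
Because the file argument is optional, BlobWriter also works as a pure size-and-hash accumulator; a sketch mirroring the first pass of Blob.create_from_file:

from snoop.data.models import BlobWriter, chunks

writer = BlobWriter()  # no backing file: only size and hashes are tracked
with open('example.bin', 'rb') as f:
    for block in chunks(f):
        writer.write(block)

fields = writer.finish()
print(fields['size'], fields['sha3_256'])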

Digest #

Digest contains all the data we have parsed for a de-duplicated document.

The data is neatly stored as JSON in the "result" blob, ready for quick re-indexing if the need arises.

Attributes#
blob #

The de-duplicated Document for which processing has happened.

This corresponds to snoop.data.models.File.blob, not snoop.data.models.File.original.

extra_result #

The Blob that contains the result of the digests.index task, encoded as JSON. The field is optional, and is required by tasks that depend on the digests.index task.

This may become huge, so we store it as a Blob instead of a JSON field.

result #

The Blob that contains the result of parsing the document, encoded as JSON.

This output is generated by the digests.gather task.

This may become huge, so we store it as a Blob instead of a JSON field.

Methods#
__str__(self) special #

To represent a Digest we use its blob hash and the result hash.

Source code in snoop/data/models.py
def __str__(self):
    """To represent a Digest we use its blob hash and the result hash.
    """
    return f'{self.blob} -> {self.result.pk[:5]}...'
get_etag(self) #

Compute HTTP ETag header for this Digest. To be used for implementing caching mechanisms.

Source code in snoop/data/models.py
def get_etag(self):
    """Compute HTTP ETag header for this Digest.
    To be used for implementing caching mechanisms."""
    etag = str(self.pk)
    etag += ':'
    if self.result:
        etag += str(self.result.pk)
    etag += ':'
    if self.extra_result:
        etag += str(self.extra_result.pk)
    etag += ':'
    etag += str(self.date_modified)
    etag += ':'
    etag += str(self.date_created)
    etag = etag.encode('utf-8', errors='backslashreplace')
    etag = hashlib.sha1(etag).hexdigest()
    return etag
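
A hedged sketch of how get_etag() could back HTTP caching in a Django view; the view itself is illustrative and not part of this module:

from django.http import HttpResponseNotModified, JsonResponse

def digest_view(request, digest):
    etag = digest.get_etag()
    # Short-circuit when the client already holds the current version.
    if request.headers.get('If-None-Match') == etag:
        return HttpResponseNotModified()
    response = JsonResponse(digest.result.read_json())
    response['ETag'] = etag
    return response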

Directory #

Database model for a file directory.

Along with File, this comprises the file tree structure analyzed by Hoover. A Directory can be found in two places: in another Directory, or as the only child of some archive or archive-like file.

Attributes#
container_file #

The parent, if it's a file (archive, email-archive or something else), else NULL.

Mutually exclusive with snoop.data.models.Directory.parent_directory.

name property readonly #

Decodes the name of this Directory as UTF-8.

Escapes UTF-8 encoding errors with 'surrogateescape' - this has the advantage that it's reversible, for bad encodings.

name_bytes #

Name of directory on disk, as bytes.

We store this as bytes and not as strings because we have to support a multitude of original filesystems and encodings that create mutually invalid results.

parent property readonly #

Returns its parent, be it a File or Directory.

parent_directory #

The parent, if it is a directory, or NULL.

Mutually exclusive with snoop.data.models.Directory.container_file.

path_str property readonly #

Returns a string representation of its full path.

Methods#
__str__(self) special #

String representation for this Directory is its full path.

Source code in snoop/data/models.py
def __str__(self):
    """String representation for this Directory is its full path.
    """
    # ensure no display errors by replacing surrogates with backslashes
    name = self.path_str.encode('utf8', errors='surrogateescape')
    name = name.decode('utf8', errors='backslashreplace')
    return truncatechars(name, 70)
ancestry(item) #

Yields ancestors until root is found.

Source code in snoop/data/models.py
def ancestry(item):
    """Yields ancestors until root is found.
    """
    while item:
        yield item
        item = item.parent
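
Since ancestry() yields the item itself first and the root last, a path can be rebuilt by reversing it; a sketch where directory is a hypothetical Directory instance (in practice, the path_str property already provides this):

parts = [node.name for node in directory.ancestry()]
print('/'.join(reversed(parts)))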
root() classmethod #

Get the root of the whole filesystem.

Exceptions:

- DoesNotExist: if the table is empty.

Source code in snoop/data/models.py
@classmethod
def root(cls):
    """Get the root of the whole filesystem.

    Raises:
        DoesNotExist: if table empty.
    """
    return cls.objects.filter(
        parent_directory__isnull=True,
        container_file__isnull=True
    ).first()

DocumentUserTag #

Table used to store tags made by users.

Both private and public tags are stored here.

Private tags are stored in separate Elasticsearch fields, one field per user. Tags are referenced both by username and by user UUID, since usernames can't be used as part of the Elasticsearch field name (they can contain characters, like the dot '.', that are not allowed in field names).

Attributes#
blob property readonly #

Returns the Blob containing the document for this tag.

date_indexed #

Moment when document containing this tag was re-indexed.

digest #

Document being tagged.

field property readonly #

Returns the elasticsearch field name for this tag.

public #

Boolean that decides whether the tag is public or private.

tag #

String with the actual tag.

user #

Username, as string (to send back in the API).

uuid #

Unique identifier for user, used in elasticsearch field name.

Methods#
delete(self, *args, **kwargs) #

Override for re-indexing document targeted by this tag.

Source code in snoop/data/models.py
def delete(self, *args, **kwargs):
    """Override for re-indexing document targeted by this tag.
    """

    super().delete(*args, **kwargs)

    from . import digests
    digests.retry_index(self.blob)
save(self, *args, **kwargs) #

Override for re-indexing document targeted by this tag.

Source code in snoop/data/models.py
def save(self, *args, **kwargs):
    """Override for re-indexing document targeted by this tag.
    """

    self.check_tag_name()

    self.date_indexed = None
    super().save(*args, **kwargs)

    from . import digests
    digests.retry_index(self.blob)

Entity #

Database model for Entities. Entities have a text field for their string and a type. Additionally, they may have a parent (if merged), or can be blacklisted (so they are not shown as entities).

Methods#
__repr__(self) special #

Return str(self).

Source code in snoop/data/models.py
def __str__(self):
    return f'entity.{self.type.type}: {self.entity}'

EntityHit #

Database model for an entity hit. An entity hit is an occurrence of an entity in a text source, meaning the entity was found in the text (specifically, between the start and end indices). The language model used is also stored as a foreign key, in order to discern which language model produced the hit.

Methods#
__repr__(self) special #

Return str(self).

Source code in snoop/data/models.py
def __str__(self):
    return f'{self.entity}'

EntityType #

Database model for an entity type. A default set of entity types is added automatically by data migration.

Methods#
__repr__(self) special #

Return str(self).

Source code in snoop/data/models.py
def __str__(self):
    return f'{self.type}'

File #

Database model for a file found in the dataset.

Attributes#
blob #

The converted data for this File.

This is usually identical to original, but for some file formats conversion is required before any further processing (like Apple email .emlx, which is basically .eml with some binary data prefixed to it).

ctime #

Taken from stat() or other sources.

mtime #

Taken from stat() or other sources.

name property readonly #

Decodes the name of this File as UTF-8.

Escapes UTF-8 encoding errors with 'surrogateescape' - this has the advantage that it's reversible, for bad encodings.

name_bytes #

Name of file on disk, as bytes.

We store this as bytes and not as strings because we have to support a multitude of original filesystems and encodings that create mutually invalid results.

original #

The original data found for this File.

parent property readonly #

Returns the ID of the parent directory.

parent_directory #

The directory containing this File.

size #

Size, taken from stat(), in bytes.

Methods#
__str__(self) special #

String representation for a File is its filename, with non-UTF8 code points escaped with backslashes, truncated.

Source code in snoop/data/models.py
def __str__(self):
    """String representation for a File is its filename,
    with non-UTF8 code points escaped with backslashes, truncated.
    """
    name_bytes = self.name_bytes
    if isinstance(name_bytes, memoryview):
        name_bytes = name_bytes.tobytes()
    the_str = truncatechars(name_bytes.decode('utf8', errors='backslashreplace'), 60)
    return f'File({the_str})'

LanguageModel #

Database model for language models. This can be used to filter for the results of a specific language model. The language code is the code of the language covered, or 'mlt' for multilingual models. The engine is either spacy or polyglot. The description is the model string, for example 'xx_ent_wiki_sm' for the multilingual spacy model based on the WikiNER data set.

Methods#
__repr__(self) special #

Return str(self).

Source code in snoop/data/models.py
def __str__(self):
    return f'{self.model_name}'

OcrDocument #

Database model for External OCR result files found on disk.

Attributes#
ocr #

A Blob with the data found (probably text or PDF).

original_hash #

The MD5 hash found on filesystem.

The document targeted by this External OCR document is going to have the same MD5.

source #

The OcrSource instance this document belongs to.

text #

The extracted text for this entry (either read directly, or with pdftotext).

OcrSource #

Database model for a directory on disk containing External OCR files.

Attributes#
name #

Identifier slug for this External OCR source.

A directory with the same name must be present under the "ocr" directory in the collection location.

Methods#
__repr__(self) special #

Return str(self).

Source code in snoop/data/models.py
def __str__(self):
    return f"{self.pk}: {self.name}"
mount_root(self) #

Returns the absolute path for the External OCR source.

Source code in snoop/data/models.py
@contextmanager
def mount_root(self):
    """Returns the absolute path for the External OCR source.
    """

    with collections.current().mount_collections_root() as collection_root:
        path = Path(collection_root) / 'ocr' / self.name
        assert path.is_dir()
        yield path

PdfPreview #

Database model for storing the pdf preview corresponding to a document.

Statistics #

Database model for storing collection statistics.

Most statistics queries take a long time to run, so we run them periodically (starting every few minutes, depending on server load).

We store here things like task counts and task progress percentages.

Scheduling is done separately, so there are no timestamps here.

Attributes#
key #

string identifier for this statistic.

value #

JSON with computed result.

Task #

Database model for tracking status of the processing pipeline.

Each row in this table tracks an application of a Python function to some arguments. Additional arguments can also be supplied as other Tasks that must run before this one.

Attributes#
ALL_STATUS_CODES #

List of all valid status codes.

Todo

We should really change these out for Enums at some point.

args #

JSON containing arguments.

blob_arg #

If the first argument is a Blob, it will be duplicated here.

Used to optimize fetching tasks, as most tasks will only process one Blob as input.

broken_reason #

Identifier with reason for this permanent failure.

date_finished #

Moment when task finished running.

Used in logic for retrying old errors and re-running sync tasks.

date_started #

Moment when task started running.

This isn't saved on the object when the task actually starts, in order to limit database writes.

error #

Text with stack trace, if status is "error" or "broken".

fail_count #

The number of times this function has failed in a row.

Used to stop retrying tasks that will never make it.

func #

String key for Python function.

Supplied as argument in the decorator snoop.data.tasks.snoop_task.

See snoop.data.tasks for general definition and snoop.data.filesystem, snoop.data.analyzers.init and snoop.data.digests for actual Task implementations.

log #

Text with first few KB of logs generated when this task was run.

result #

Binary object with result of running the function.

Is set if finished successfully, and if the function actually returns a Blob value.

status #

String token with task status; see above.

STATUS_BROKEN #

Permanent error.

Used for known types of breakage, such as encrypted archives, encrypted PDFs, or dependencies that are themselves in an ERROR state.

STATUS_DEFERRED #

Waiting on some other task to finish.

STATUS_ERROR #

Unexpected error.

Might be temporary, might be permanent; we don't know.

STATUS_PENDING #

Task either wasn't run yet, or was started but not finished.

Distinguishing pending from running would require a write inside our transaction, so we can't tell the difference from outside the task runner anyway.

STATUS_QUEUED #

Used for tasks that have been put on the queue.

STATUS_STARTED #

Has been started by the worker at some point.

Used to detect when the Python process was unexpectedly killed, e.g. by the OOM killer.

STATUS_SUCCESS #

Task finished successfully.

version #

The version of the function that ran this task.

Used to re-process data when the code (version number) is changed.

Methods#
__str__(self) special #

String representation for a Task contains its name, args and status.

Source code in snoop/data/models.py
def __str__(self):
    """String representation for a Task contains its name, args and status.
    """
    deps = ''
    prev_set = self.prev_set.all()
    prev_ids = ', '.join(str(t.prev.pk) for t in prev_set)
    deps = '; depends on ' + prev_ids if prev_ids else ''
    the_args = str([truncatechars(str(x), 12) for x in self.args])
    return f'Task #{self.pk} {self.func}({the_args}{deps}) [{self.status}]'
size(self) #

Returns task size in bytes. Includes blob argument size, JSON argument size, and all dependency result blob sizes, all added up.

Source code in snoop/data/models.py
def size(self):
    """Returns task size in bytes.
    Includes blob argument size, JSON argument size, and all dependency result blob sizes, all added up.
    """
    s = len(json.dumps(self.args))
    if self.blob_arg:
        s += self.blob_arg.size

    for dep in self.prev_set.all():
        if dep.prev.result:
            s += dep.prev.result.size

    return s
update(self, status = None, error = None, broken_reason = None, log = None, version = None) #

Helper method to update multiple fields at once, without saving.

This method also truncates our Text fields to decent limits, so it's preferred to use this instead of the fields directly.

Parameters:

- status (default: None): field to set, if not None.
- error (default: None): field to set, if not None.
- broken_reason (default: None): field to set, if not None.
- log (default: None): field to set, if not None.
- version (default: None): field to set, if not None.
Source code in snoop/data/models.py
def update(self, status=None, error=None, broken_reason=None, log=None, version=None):
    """Helper method to update multiple fields at once, without saving.

    This method also truncates our Text fields to decent limits, so it's
    preferred to use this instead of the fields directly.

    Args:
        status: field to set, if not None
        error: field to set, if not None
        broken_reason: field to set, if not None
        log: field to set, if not None
        version: field to set, if not None
    """
    def _escape(s):
        """Escapes non-printable characters as \\XXX.

        Args:
            s: string to escape
        """
        def _translate(x):
            """Turns non-printable characters into \\XXX, prerves the rest.

            Args:
                x:
            """
            if x in string.printable:
                return x
            return f'\\{ord(x)}'
        return "".join(map(_translate, s))

    old_version = self.version
    if version is not None:
        self.version = version

    if status is not None:
        self.status = status

    if error is not None:
        self.error = _escape(error)[:2**13]  # 8k of error screen
    if broken_reason is not None:
        self.broken_reason = _escape(broken_reason)[:2**12]  # 4k reason
    if log is not None:
        self.log = _escape(log)[:2**14]  # 16k of log space

    # Increment fail_count only if we ran the same version and still got a bad status code.
    # Reset the fail count only when status is success, or if the version changed.
    if self.status == self.STATUS_SUCCESS or old_version != self.version:
        self.fail_count = 0
    elif self.status in [self.STATUS_BROKEN, self.STATUS_ERROR]:
        self.fail_count = self.fail_count + 1
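
A hedged usage sketch: update() only mutates and truncates the fields, so the caller still has to persist them:

task.update(
    status=Task.STATUS_ERROR,
    error=traceback_text,  # hypothetical: a formatted stack trace string
    log=captured_log,      # hypothetical: captured worker output
)
task.save()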

TaskDependency #

Database model for tracking which Tasks depend on which.

Attributes#
name #

a string used to identify the kwarg name of this dependency

next #

the task that depends on prev

prev #

the task needed by another task

Methods#
__str__(self) special #

String representation for a TaskDependency contains both task IDs and an arrow.

Source code in snoop/data/models.py
def __str__(self):
    """String representation for a TaskDependency contains both task IDs
    and an arrow.
    """
    return f'{self.prev} -> {self.next}'

Thumbnail #

Database model for storing the Thumbnail corresponding to a Digest.

Attributes#
blob #

Foreign Key to the original File's blob.

source #

Foreign Key to the blob used for computation.

thumbnail #

Foreign Key to the corresponding thumbnail-blob.

Classes#
SizeChoices #

An enumeration.

Functions#

blob_repo_path(sha3_256) #

Returns a string pointing to the blob object for given hash.

Parameters:

- sha3_256 (required): hash used to compute the object path.
Source code in snoop/data/models.py
def blob_repo_path(sha3_256):
    """Returns a string pointing to the blob object for given hash.

    Args:
        sha3_256: hash used to compute the object path
    """
    return sha3_256[:2] + '/' + sha3_256[2:4] + '/' + sha3_256[4:]
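
For example, using the well-known SHA3-256 hash of the empty string:

import hashlib
from snoop.data.models import blob_repo_path

h = hashlib.sha3_256(b'').hexdigest()
print(blob_repo_path(h))
# a7/ff/c6f8bf1ed76651c14756a061d662f580ff4de43b49fa82d80a4b80f8434a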

chunks(file, blocksize = 65536) #

Splits file into binary chunks of fixed size.

Parameters:

- file (required): file-like object, already opened.
- blocksize (default: 65536): size, in bytes, of the byte strings yielded.
Source code in snoop/data/models.py
def chunks(file, blocksize=65536):
    """Splits file into binary chunks of fixed size.

    Args:
        file: file-like object, already opened
        blocksize: size, in bytes, of the byte strings yielded
    """
    while True:
        data = file.read(blocksize)
        if not data:
            return
        yield data
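
A usage sketch: stream a large file through chunks() to hash it without loading it all into memory:

import hashlib
from snoop.data.models import chunks

digest = hashlib.sha3_256()
with open('large-file.bin', 'rb') as f:
    for block in chunks(f):
        digest.update(block)
print(digest.hexdigest())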