Skip to content

snoop.data.views #

Django views, mostly JSON APIs.

Attributes#

DOWNLOAD_CACHE_MAX_AGE #

cache all 'downloadable' documents on server for a week - these will never change

MAX_CACHE_ITEM_SIZE #

The biggest object to be put in cache (and loaded in memory, since Django doesn't support streaming from caches). Should be max 100-200MB.

SHORT_LIVED_CACHE_OPTIONS #

Cache-Control options to revalidate every 30-60s, for things that can lag behind, e.g. collection stats, task processing status - useful to decrease load

Classes#

TagViewSet #

Django Rest Framework (DRF) View set for the Tags APIs.

This is responsible for: capturing the various URL path arguments as the viewset context; setting the current collection with drf_collection_view(); restricting private Tags access to correct users.

Classes#
serializer_class #

Serializer for the Tags API.

Combines fields from the table with other fields from the URL path. Since this URL path is private between the backend services, we use it to store pertinent information too (e.g. user interacting with tags).

All fields are read-only except "public" and "tag". The UI doesn't edit "tag", so we may remove editing it in the future.

Classes#
Meta #

Configure the serializer model, fields and read only fields.

Classes#
model #

Table used to store tags made by users.

Both private and public tags are stored here.

Private tags are stored on separate Elasticsearch fields, one field per user. Tags are referenced both by usernames and user UUIDs, since we can't use usernames as parts of the elasticsearch field name (since they can contain characters like dot '.' that cannot be part of a field name).

Attributes#
blob property readonly #

Returns the Blob containing the document for this tag.

date_indexed #

Moment when document containing this tag was re-indexed.

digest #

Document being tagged.

field property readonly #

Returns the elasticsearch field name for this tag.

public #

Boolean that decides type of tag

tag #

String with the actual tag.

user #

Username, as string (to send back in the API).

uuid #

Unique identifier for user, used in elasticsearch field name.

Methods#
delete(self, *args, **kwargs) #

Override for re-indexing document targeted by this tag.

Source code in snoop/data/views.py
def delete(self, *args, **kwargs):
    """Delete this tag, then re-index the document it was attached to.

    After the row is removed through the parent implementation,
    `digests.retry_index` is called with this tag's blob.
    """
    # Function-scope import, as in the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from . import digests

    super().delete(*args, **kwargs)
    digests.retry_index(self.blob)
save(self, *args, **kwargs) #

Override for re-indexing document targeted by this tag.

Source code in snoop/data/views.py
def save(self, *args, **kwargs):
    """Validate and store this tag, then re-index the document it targets.

    The tag name is checked first with `check_tag_name()`. `date_indexed` is
    reset to None before saving; afterwards `digests.retry_index` is called
    with this tag's blob.
    """
    # Function-scope import, as in the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from . import digests

    self.check_tag_name()

    # Clear the index timestamp before the row is written.
    self.date_indexed = None
    super().save(*args, **kwargs)

    digests.retry_index(self.blob)
Methods#
create(self, validated_data) #

Get additional fields from context when creating object.

See snoop.data.views.TagViewSet.get_serializer.

Source code in snoop/data/views.py
def create(self, validated_data):
    """Create the object, filling in fields that come from the serializer context.

    The `user`, `uuid` and `digest_id` values are taken from the context and
    override anything with the same key in `validated_data`.

    See [snoop.data.views.TagViewSet.get_serializer][].
    """

    context = self.context
    # Work on a copy so the caller's validated_data is left untouched.
    fields = dict(validated_data)
    fields.update(
        user=context['user'],
        uuid=context['uuid'],
        digest_id=context['digest_id'],
    )
    return super().create(fields)
update(self, instance, validated_data) #

Get additional fields from context when updating object.

See snoop.data.views.TagViewSet.get_serializer.

Source code in snoop/data/views.py
def update(self, instance, validated_data):
    """Update the object, filling in fields that come from the serializer context.

    The `user`, `uuid` and `digest_id` values are taken from the context and
    override anything with the same key in `validated_data`.

    See [snoop.data.views.TagViewSet.get_serializer][].
    """

    context = self.context
    # Work on a copy so the caller's validated_data is left untouched.
    fields = dict(validated_data)
    fields.update(
        user=context['user'],
        uuid=context['uuid'],
        digest_id=context['digest_id'],
    )
    return super().update(instance, fields)
Methods#
check_ownership(self, pk) #

Raises error if tag does not belong to current user.

To be used when doing write operations.

Source code in snoop/data/views.py
def check_ownership(self, pk):
    """Raise an error if the tag does not belong to the current user.

    To be used when doing write operations.

    Args:
        pk: primary key of the tag, looked up through `get_queryset()`.

    Raises:
        AssertionError: if the tag's `user` differs from the `username`
            URL argument.
    """
    requesting_user = self.kwargs['username']
    tag_owner = self.get_queryset().get(pk=pk).user
    # An explicit raise instead of an `assert` statement: asserts are removed
    # when Python runs with -O, which would silently disable this check.
    # The exception type and message are unchanged.
    if requesting_user != tag_owner:
        raise AssertionError("you can only modify your own tags")
destroy(self, request, pk = None, **kwargs) #

Collection-aware overload that also checks permission to write tag.

Source code in snoop/data/views.py
@drf_collection_view
def destroy(self, request, pk=None, **kwargs):
    """Delete a tag inside the current collection, after checking that it
    belongs to the requesting user.
    """
    # The ownership check must run before anything is deleted.
    self.check_ownership(pk)
    response = super().destroy(request, pk, **kwargs)
    return response
dispatch(self, *args, **kwargs) #

Collection-aware overload.

Source code in snoop/data/views.py
@drf_collection_view
def dispatch(self, *args, **kwargs):
    """Run the regular dispatch with the collection from the URL set as current."""
    response = super().dispatch(*args, **kwargs)
    return response
get_queryset(self) #

Sets this TagViewSet's queryset to tags that are private to the current user, or that are public.

Source code in snoop/data/views.py
@drf_collection_view
def get_queryset(self):
    """Return the tags on the document from the URL that the current user may see:
    the user's own tags plus all public ones.
    """

    url_args = self.kwargs
    owned_or_public = Q(user=url_args['username']) | Q(public=True)
    on_this_document = Q(digest__blob=url_args['hash'])

    # No check that the hash refers to an existing Digest: an unknown hash
    # simply produces an empty queryset.
    return models.DocumentUserTag.objects.filter(owned_or_public, on_this_document)
get_serializer(self, *args, **kwargs) #

Set a context with the path arguments.

Generates fake values when instantiated by Swagger.

Source code in snoop/data/views.py
@drf_collection_view
def get_serializer(self, *args, **kwargs):
    """Build the serializer with a context made from the URL path arguments.

    When the view is instantiated by Swagger (`swagger_fake_view` is set),
    placeholder values are used instead of the real path arguments.
    """
    if getattr(self, 'swagger_fake_view', False):
        # Schema generation only: there is no real request to read from.
        context = dict(
            collection="some-collection",
            blob="0006660000000000000000000000000000000000000000000000000000000000",
            user="testuser",
            digest_id=666,
            uuid='invalid',
        )
    else:
        url_args = self.kwargs
        try:
            digest_id = models.Digest.objects.filter(blob=url_args['hash']).get().id
        except models.Digest.DoesNotExist:
            # No Digest for this hash: the context carries None.
            digest_id = None

        context = dict(
            collection=url_args['collection'],
            blob=url_args['hash'],
            user=url_args['username'],
            digest_id=digest_id,
            uuid=url_args['uuid'],
        )
    return super().get_serializer(*args, **kwargs, context=context)
partial_update(self, request, pk = None, **kwargs) #

Collection-aware overload that also checks permission to write tag.

Source code in snoop/data/views.py
@drf_collection_view
def partial_update(self, request, pk=None, **kwargs):
    """Partially update a tag inside the current collection, after checking
    that it belongs to the requesting user.
    """
    # The ownership check must run before anything is modified.
    self.check_ownership(pk)
    response = super().partial_update(request, pk, **kwargs)
    return response
update(self, request, pk = None, **kwargs) #

Collection-aware overload that also checks permission to write tag.

Source code in snoop/data/views.py
@drf_collection_view
def update(self, request, pk=None, **kwargs):
    """Update a tag inside the current collection, after checking that it
    belongs to the requesting user.
    """
    # The ownership check must run before anything is modified.
    self.check_ownership(pk)
    response = super().update(request, pk, **kwargs)
    return response

Functions#

collection(request) #

View returns basic stats for a collection as JSON.

Also loads the "stats" for this collection, as saved by snoop.data.admin.get_stats.

Source code in snoop/data/views.py
@collection_view
@cache_control(**SHORT_LIVED_CACHE_OPTIONS)
def collection(request):
    """Return basic information and statistics about the current collection as JSON.

    The statistics come from the `Statistics` row with key "stats" (created
    if missing), as saved by `snoop.data.admin.get_stats`. Entries whose key
    starts with an underscore are left out of the response.
    """

    current = collections.current()
    stats_row, _created = models.Statistics.objects.get_or_create(key='stats')

    public_stats = {}
    for key, value in stats_row.value.items():
        if key.startswith('_'):
            continue
        public_stats[key] = value

    payload = {
        'name': current.name,
        'title': current.name,
        'description': current.name,
        'feed': 'feed',
        'data_urls': '{id}/json',
        'stats': public_stats,
        'max_result_window': current.max_result_window,
        'refresh_interval': current.refresh_interval,
    }
    return JsonResponse(payload)

collection_view(func) #

Decorator for Django views bound to a collection.

The collection slug is set through an URL path parameter called "collection".

Source code in snoop/data/views.py
def collection_view(func):
    """Decorator for Django views bound to a collection.

    The collection slug arrives as the URL path parameter "collection". It is
    looked up in `collections.ALL`, set as the current collection while the
    view runs, and is not passed on to the wrapped view. An unknown slug
    results in a 404.
    """

    @tracer.wrap_function()
    @wraps(func)
    def wrapper(request, *args, collection, **kwargs):
        try:
            selected = collections.ALL[collection]
        except KeyError:
            raise Http404(f"Collection {collection} does not exist")

        with selected.set_current():
            tracer.count('api_collection_view')
            response = func(request, *args, **kwargs)
        return response

    return wrapper

condition_cache(etag_func = None, last_modified_func = None, max_delay = 0, cache_content_age = 604800, version = 1) #

Copypasta of django.views.decorators.http.condition, but augmented to also correctly do server-side caching.

The generated etag, last-modified and request-generated metadata are combined into a single cache key to cache the response -- but only if the status code is 200.

The etag and last-modified headers are optionally cached for a set amount of time, to limit the frequency of running etag_func and last_modified_func. This causes a delay in getting the latest content.

The content is cached in conditional_view_content. The etag and last-modified are cached in conditional_view_etag.

This system does not handle cache invalidation; outdated content is left to expire after cache_content_age.

Source code in snoop/data/views.py
def condition_cache(
    etag_func=None, last_modified_func=None,
    max_delay=0,
    cache_content_age=DOWNLOAD_CACHE_MAX_AGE,
    version=CACHE_VERSION,
):
    """
    Copypasta of `django.views.decorators.http.condition`, but augmented to
    also correctly do server-side caching.

    The generated etag, last-modified and request-generated metadata are
    combined into a single cache key to cache the response -- but only if the
    response is not streaming, has a 2xx status code, and has a non-empty body
    of at most `MAX_CACHE_ITEM_SIZE` bytes.

    The etag and last-modified headers are optionally cached for a set amount
    of time, to limit the frequency of running `etag_func` and
    `last_modified_func`. This causes a delay in getting the latest content.

    The content is cached in `conditional_view_content`. The etag and
    last-modified are cached in `conditional_view_etag`.

    This system does not handle cache invalidation; outdated content is left to
    expire after `cache_content_age`.

    Args:
        etag_func: optional callable, called as `etag_func(request, *args, **kwargs)`
            with the view's arguments; its result (if not None) is passed
            through `quote_etag`.
        last_modified_func: optional callable, called with the same arguments;
            a truthy result is treated as a datetime (naive values are made
            aware as UTC) and converted to an integer timestamp.
        max_delay: timeout used for the cached etag / last-modified values and
            the `max_age` of the Cache-Control header. When falsy, the
            validators are not cached and `max_age` is 0.
        cache_content_age: timeout used when storing the response in the
            content cache.
        version: passed to `_make_cache_key` for every key built here.

    Returns:
        A decorator for a view function.

    At least one of `etag_func` and `last_modified_func` must be given.
    """
    import datetime
    from django.utils.http import http_date, quote_etag
    from django.utils.cache import get_conditional_response
    from django.utils import timezone
    cache_etag = django_caches['conditional_view_etag']
    cache_content = django_caches['conditional_view_content']
    # Cache-Control options applied to every response of the decorated view.
    cache_control_opt = dict(
        private=True,
        must_revalidate=True,
        max_age=max_delay or 0,
    )
    assert etag_func or last_modified_func, 'no function given'

    def decorator(func):
        def _pre_process_request(request, *args, **kwargs):
            # Returns a tuple (response, etag, last_modified). The response is
            # either a conditional response, a cached response, or the freshly
            # computed one.
            key_last_modified = _make_cache_key(request, 'last-modified', version)
            key_etag = _make_cache_key(request, 'etag', version)

            res_last_modified = None
            if last_modified_func:
                # Edit: get last modified from cache
                if max_delay:
                    res_last_modified = cache_etag.get(key_last_modified)
                # Original: compute last modified
                if not res_last_modified:
                    if dt := last_modified_func(request, *args, **kwargs):
                        if not timezone.is_aware(dt):
                            dt = timezone.make_aware(dt, datetime.timezone.utc)
                        res_last_modified = int(dt.timestamp())
                # Edit: put last modified in cache
                if res_last_modified and max_delay:
                    cache_etag.add(key_last_modified, res_last_modified, timeout=max_delay)

            # Edit: get etag from cache
            res_etag = None
            if max_delay:
                res_etag = cache_etag.get(key_etag)

            # Original: compute etag
            if not res_etag:
                res_etag = etag_func(request, *args, **kwargs) if etag_func else None
                res_etag = quote_etag(res_etag) if res_etag is not None else None

            # Edit: put etag in cache
            if res_etag and max_delay:
                cache_etag.add(key_etag, res_etag, timeout=max_delay)

            # Original: get conditional response (304 Not Modified & friends)
            # This path returns before the content cache is consulted.
            if response := get_conditional_response(
                request,
                etag=res_etag,
                last_modified=res_last_modified,
            ):
                return response, res_etag, res_last_modified

            # Edit: fetch response from cache
            # The content key includes both validators, so a changed etag or
            # last-modified value leads to a different key.
            key_content = _make_cache_key(request, res_etag, res_last_modified, version)
            if response := cache_content.get(key_content):
                log.warning('CONDITION CACHE HIT: %s', key_content)
                return response, res_etag, res_last_modified

            # Original: compute response (not conditional, not cached)
            if response is None:
                t0 = time.time()
                log.warning('CONDITION CACHE MISS: %s', key_content)
                response = func(request, *args, **kwargs)
                # Edit: put compute time on the response; the header is set
                # before the response is stored, so it is part of what gets cached.
                dt = time.time() - t0
                dt_ms = 1 + int(dt * 1000)
                response['X-Hoover-Request-Handle-Duration-ms'] = str(dt_ms)

            # Edit: put response in cache
            if (
                response is not None
                and not response.streaming
                and 0 < len(response.content) <= MAX_CACHE_ITEM_SIZE
                and 200 <= response.status_code < 300
            ):
                log.warning('CONDITION CACHE ADD: %s', key_content)
                cache_content.add(key_content, response, timeout=cache_content_age)
            else:
                content_len = (
                    len(response.content)
                    if (response and not response.streaming)
                    else 'unknown'
                )
                # NOTE(review): unlike the checks above, the attribute accesses
                # in this log call are not guarded against `response` being None.
                log.warning(
                    'CONDITION CACHE REJECT: %s resp=%s streaming=%s len=%s status=%s',
                    key_content, response,
                    response.streaming,
                    content_len,
                    response.status_code,
                )

            return response, res_etag, res_last_modified

        def _post_process_request(request, response, res_etag, res_last_modified):
            # Set relevant headers on the response if they don't already exist
            # and if the request method is safe.
            if request.method in ("GET", "HEAD"):
                if res_last_modified and not response.has_header("Last-Modified"):
                    response.headers["Last-Modified"] = http_date(res_last_modified)
                if res_etag:
                    response.headers.setdefault("ETag", res_etag)

        @vary_on_headers(*CACHE_VARY_ON_HEADERS)
        @cache_control(**cache_control_opt)
        @wraps(func)
        def inner(request, *args, **kwargs):
            response, res_etag, res_last_modified = _pre_process_request(
                request, *args, **kwargs
            )
            # NOTE(review): _pre_process_request already calls `func` on a
            # cache miss, so this fallback appears to be reachable only if the
            # view itself returned None -- confirm.
            if response is None:
                response = func(request, *args, **kwargs)
            _post_process_request(request, response, res_etag, res_last_modified)
            return response

        return inner

    return decorator

directory_last_modified(request, pk, *_args, **_kw) #

Get the last modified ts of either this Dir obj or any of its children

Source code in snoop/data/views.py
def directory_last_modified(request, pk=None, *_args, directory_pk=None, **_kw):
    """Get the last modified ts of either this Dir obj or any of its children.

    Args:
        request: the current request (not used).
        pk: primary key of the Directory.
        directory_pk: alias for `pk`. `condition_cache` calls this function
            with exactly the arguments of the decorated view, and the views
            using it (`file_exists`, `get_path`) name their argument
            `directory_pk`; without the alias a keyword call fails with a
            TypeError for the missing `pk`.

    Returns:
        The latest `date_modified` among the directory, its child directories
        and its child files.

    Raises:
        Http404: if no Directory with that primary key exists.
    """
    if pk is None:
        pk = directory_pk
    directory = get_object_or_404(models.Directory.objects, pk=pk)
    doc_ts = directory.date_modified

    for children in (directory.child_directory_set, directory.child_file_set):
        # Max() aggregates to None when there are no rows, so one query both
        # detects the empty case and fetches the value (no separate exists()).
        children_ts = children.aggregate(maxval=Max('date_modified'))['maxval']
        if children_ts is not None:
            doc_ts = max(children_ts, doc_ts)
    return doc_ts

document(request, hash) #

JSON view with data for a Digest.

The hash of the Digest source object is used to fetch it. If a Digest object doesn't exist, that means processing has failed and we need to fetch the File for metadata.

These are the de-duplicated variants of the objects returned from file_view() above, with some differences. See snoop.data.digests.get_document_data() versus snoop.data.digests.get_file_data().

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=document_digest_last_modified,
                 etag_func=document_digest_etag_key, max_delay=30)
def document(request, hash):
    """JSON view with data for a Digest.

    The hash of the Digest source object is used to fetch it. If a Digest object doesn't exist, that means
    processing has failed and we need to fetch the File for metadata.

    These are the de-duplicated variants of the objects returned from `file_view()`, with some
    differences. See `snoop.data.digests.get_document_data()` versus `snoop.data.digests.get_file_data()`.

    Args:
        request: the current request; the optional GET parameter `children_page` (default 1) selects
            the page of children.
        hash: primary key of the Blob.

    Raises:
        Http404: if no Blob with that primary key exists.
    """

    # An unknown hash is answered with a 404, like the other views in this
    # module, instead of an unhandled Blob.DoesNotExist.
    blob = get_object_or_404(models.Blob.objects, pk=hash)
    children_page = int(request.GET.get('children_page', 1))
    return JsonResponse(trim_text(digests.get_document_data(blob, children_page)))

document_download(request, hash, filename) #

View to download the .original Blob for the first File in a Digest's set.

Since all post-conversion .blobs are bound to the same Digest object, we assume the .original Blobs are all equal too; so we present only the first one for downloading.

HTML files have special treatment (we remove the unsafe tags) - because of the risk of offensive tracking scripts inside that could call out.

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=document_digest_last_modified,
                 etag_func=document_digest_etag_key, max_delay=300)
def document_download(request, hash, filename):
    """View to download the `.original` Blob for the first File in a Digest's set.

    Since all post-conversion `.blob`s are bound to the same `Digest` object, we assume the `.original`
    Blobs are all equal too; so we present only the first one for downloading.

    HTML files have special treatment (we remove the unsafe tags) - because of the risk of offensive
    tracking scripts inside that could call out.

    Args:
        request: the current request.
        hash: primary key of the Digest's blob.
        filename: taken from the URL; the name sent back comes from the File row instead.

    Raises:
        Http404: if there is no Digest for the hash, or its blob has no File.
    """

    digest = get_object_or_404(
        models.Digest.objects.only('blob'),
        blob__pk=hash,
    )
    first_file = digest.blob.file_set.first()
    if first_file is None:
        # first() returns None for an empty set; answer with a 404 rather than
        # failing on the attribute access below.
        raise Http404("No file found for this document")
    blob = first_file.original

    if html.is_html(blob):
        clean_html = html.clean(blob)
        return HttpResponse(clean_html, content_type='text/html')

    real_filename = first_file.name_bytes.tobytes().decode('utf-8', errors='replace')
    # Strip line breaks from the stored name before handing it on.
    real_filename = real_filename.replace("\r", "").replace("\n", "")

    return _get_http_response_for_blob(request, blob, real_filename)

document_locations(request, hash) #

JSON view to paginate through all locations for a Digest.

Used to browse between the different apparitions of a File in a dataset.

Paginated by integers with fixed length pages, starting from 1.

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=document_digest_last_modified,
                 etag_func=document_digest_etag_key, max_delay=300)
def document_locations(request, hash):
    """JSON view listing one page of the locations of a Digest.

    Lets the client browse the different places where the same File shows up in a dataset.

    The page number is read from the GET parameter `page` (an integer, default 1). The response
    holds the locations for that page, the page number, and whether another page follows.
    """

    digest = get_object_or_404(models.Digest.objects, blob__pk=hash)
    page = int(request.GET.get('page', 1))
    locations, has_next = digests.get_document_locations(digest, page)

    payload = {
        'locations': locations,
        'page': page,
        'has_next_page': has_next,
    }
    return JsonResponse(payload)

document_ocr(request, hash, ocrname) #

View to download the OCR result binary for a given Document and OCR source combination.

The file downloaded can either be a PDF document with selectable text imprinted in it, or a text file.

The OCR source can be either External OCR (added by management command snoop.data.management.commands.createocrsource or through the Admin), or managed internally (with the slug called tesseract_$LANG).

The given slug "ocrname" is first looked up in the snoop.data.models.OcrSource table. If it's not there, then we look in the Tasks table for dependencies of this document's Digest task, and return the one with name matching the slug.

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=document_digest_last_modified,
                 etag_func=document_digest_etag_key, max_delay=300)
def document_ocr(request, hash, ocrname):
    """View to download the OCR result binary for a given Document and OCR source combination.

    The file downloaded can either be a PDF document with selectable text imprinted in it, or a text file.

    The OCR source can be either External OCR (added by management command
    `snoop.data.management.commands.createocrsource` or through the Admin), or managed internally (with the
    slug called `tesseract_$LANG`).

    The given slug "ocrname" is first looked up in the `snoop.data.models.OcrSource` table. If it's not
    there, then we look in the Tasks table for dependencies of this document's Digest task, and return the
    one with name matching the slug.

    Raises:
        Http404: if the Digest, the OCR document, the Digest task, or a task dependency named `ocrname`
            cannot be found.
    """

    digest = get_object_or_404(models.Digest.objects, blob__pk=hash)

    # A single lookup replaces the previous exists() + get() pair.
    try:
        ocr_source = models.OcrSource.objects.get(name=ocrname)
    except models.OcrSource.DoesNotExist:
        ocr_source = None

    if ocr_source is not None:
        # serve file from external OCR import
        ocr_queryset = ocr.ocr_documents_for_blob(digest.blob)
        ocr_document = get_object_or_404(ocr_queryset, source=ocr_source)

        blob = ocr_document.ocr
    else:
        digest_task = get_object_or_404(models.Task.objects, func='digests.gather', args=[hash])
        # An unknown slug is a 404, not an unhandled DoesNotExist.
        dependency = get_object_or_404(digest_task.prev_set, name=ocrname)
        tesseract_task = dependency.prev
        blob = tesseract_task.result

    return _get_http_response_for_blob(request, blob)

drf_collection_view(func) #

Decorator for Django Rest Framework viewset methods bound to a collection.

The collection slug is set through the kwargs field on the rest_framework.viewsets.ModelViewSet called "collection". The kwargs are set by Django Rest Framework from the URL path parameter, so result is similar to snoop.data.views.collection_view() defined above.

Source code in snoop/data/views.py
def drf_collection_view(func):
    """Decorator for Django Rest Framework viewset methods bound to a collection.

    The collection slug is read from the "collection" entry of the viewset's `kwargs` (the kwargs of a
    `rest_framework.viewsets.ModelViewSet`, set by Django Rest Framework from the URL path parameters),
    so the result is similar to `snoop.data.views.collection_view()`.

    A missing "collection" entry or an unknown slug results in a 404.
    """

    @tracer.wrap_function()
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Both lookups stay inside the same try block: either one failing
        # with a KeyError means there is no such collection.
        try:
            slug = self.kwargs['collection']
            selected = collections.ALL[slug]
        except KeyError:
            raise Http404("Collection does not exist")

        with selected.set_current():
            tracer.count('api_collection_view')
            result = func(self, *args, **kwargs)
        return result

    return wrapper

feed(request) #

JSON view used to paginate through entire Digest database, sorted by last modification date.

This was used in the past by another service to pull documents as they are processed and index them elsewhere. This is not used anymore by us, since we now index documents in a snoop Task. See snoop.data.digests.index for the Task definition.

TODO: deprecate and remove this view.

Source code in snoop/data/views.py
@collection_view
@never_cache
def feed(request):
    """JSON view that pages through the whole Digest table, newest modification first.

    Another service used to pull documents through this view as they were processed, to index them
    elsewhere. We no longer use it, since documents are now indexed in a snoop Task. See
    `snoop.data.digests.index` for the Task definition.

    The optional GET parameter `lt` restricts the page to digests modified before that value. The
    `next` entry of the response is the query string for the following page, or None on the last page.

    TODO: deprecate and remove this view.
    """
    page_size = settings.SNOOP_FEED_PAGE_SIZE
    digest_query = models.Digest.objects.order_by('-date_modified')

    before = request.GET.get('lt')
    if before:
        digest_query = digest_query.filter(date_modified__lt=before)

    documents = []
    for digest in digest_query[:page_size]:
        documents.append(digests.get_document_data(digest.blob))

    # A short page means there is nothing left to fetch.
    next_page = (
        None
        if len(documents) < page_size
        else f"?lt={documents[-1]['version']}"
    )

    return JsonResponse({
        'documents': documents,
        'next': next_page,
    })

file_digest_last_modified(request, pk, *_args, **_kw) #

Get the last modified ts of either this File obj or any of its children

Source code in snoop/data/views.py
def file_digest_last_modified(request, pk, *_args, **_kw):
    """Get the last modified ts of either this File obj or any of its children.

    Args:
        request: the current request (not used).
        pk: primary key of the File.

    Returns:
        The `date_modified` of the File's digest (or of the File itself when its blob has no digest),
        or the latest `date_modified` of its child directories if that is later.

    Raises:
        Http404: if no File with that primary key exists.
    """
    file = get_object_or_404(models.File.objects, pk=pk)
    try:
        doc_ts = file.blob.digest.date_modified
    except models.Blob.digest.RelatedObjectDoesNotExist:
        # No digest for this blob: fall back to the File's own timestamp.
        doc_ts = file.date_modified

    # Max() aggregates to None when there are no rows, so one query both
    # detects the empty case and fetches the value (no separate exists()).
    children_ts = file.child_directory_set.aggregate(maxval=Max('date_modified'))['maxval']
    if children_ts is not None:
        doc_ts = max(children_ts, doc_ts)

    return doc_ts

file_exists(request, directory_pk, filename) #

View that checks if a given file exists in the database.

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=directory_last_modified, max_delay=300)
def file_exists(request, directory_pk, filename):
    """View that checks if a given file exists in the database.

    Args:
        request: the current request.
        directory_pk: primary key of the parent Directory.
        filename: name of the file; it is encoded to bytes and compared with `name_bytes`.

    Returns:
        A response whose body is the primary key of the file's `original` blob, or an empty
        response with status 404 if there is no such file.
    """
    try:
        file = models.File.objects.get(
            name_bytes=filename.encode(),
            parent_directory__pk=directory_pk)
    except models.File.DoesNotExist:
        return HttpResponse(status=404)
    # Returned unconditionally: the previous `if file:` guard left a code path
    # that fell off the end of the view and returned None.
    return HttpResponse(file.original.pk)

file_view(request, pk) #

JSON view with data for a File.

The primary key of the File is used to fetch it. Response is different from, but very similar to, the result of the document() view below.

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=file_digest_last_modified, max_delay=120)
def file_view(request, pk):
    """JSON view with data for a File, fetched by its primary key.

    The response is different from, but very similar to, the result of the `document()` view.
    The GET parameter `children_page` (an integer, default 1) selects the page of children.
    Text content is trimmed with `trim_text()` before it is returned.
    """

    file = get_object_or_404(models.File.objects, pk=pk)
    children_page = int(request.GET.get('children_page', 1))

    file_data = digests.get_file_data(file, children_page)
    payload = trim_text(file_data)
    return JsonResponse(payload)

get_path(request, directory_pk) #

Get the full path of a given directory

Source code in snoop/data/views.py
@collection_view
@condition_cache(last_modified_func=directory_last_modified, max_delay=300)
def get_path(request, directory_pk):
    """Get the full path of a given directory.

    Args:
        request: the current request.
        directory_pk: primary key of the Directory.

    Returns:
        A response whose body is `str(directory)`, or an empty response with status 404 if any
        entry of the directory's ancestry has a container file.

    Raises:
        Http404: if no Directory with that primary key exists.
    """
    # An unknown pk is answered with a 404, like the other views in this
    # module, instead of an unhandled Directory.DoesNotExist.
    directory = get_object_or_404(models.Directory.objects, pk=directory_pk)
    # check if there is a container file in the path
    if any(ancestor.container_file for ancestor in directory.ancestry()):
        return HttpResponse(status=404)
    return HttpResponse(str(directory))

processing_status(request, hash) #

View that checks the processing status of a given blob.

Searches for tasks related to the given blob and counts how many of them have ended (status success, error or broken). The blob has been processed when at least one task exists and all of its tasks have ended.

Parameters:

Name Type Description Default
hash

Primary key of the blob to be checked.

required

Returns:

Type Description

A JSON response with the keys "finished", "done_count" and "total_count".

"finished" is true only if the blob has at least one task and there are no unfinished tasks.

Source code in snoop/data/views.py
@collection_view
@cache_control(**SHORT_LIVED_CACHE_OPTIONS)
def processing_status(request, hash):
    """View that reports the processing status of a given blob.

    Counts the tasks related to the blob and how many of them have ended
    (status success, error or broken).

    Args:
        hash: Primary key of the blob to be checked.

    Returns:
        A JSON response with the keys `finished`, `done_count` and `total_count`.

        `finished` is true only when the blob has at least one task and all
        of its tasks have ended.
    """
    all_tasks = models.Task.objects.filter(blob_arg__pk=hash)
    ended_tasks = all_tasks.filter(status__in=('success', 'error', 'broken'))

    done_count = ended_tasks.count()
    total_count = all_tasks.count()

    return JsonResponse({
        'finished': total_count != 0 and done_count == total_count,
        'done_count': done_count,
        'total_count': total_count,
    })

rescan_directory(request, directory_pk) #

Start a filesystem walk in the given directory.

Source code in snoop/data/views.py
@collection_view
@never_cache
def rescan_directory(request, directory_pk):
    """Dispatch the filesystem walk tasks for the given directory.

    Always answers with an empty response with status 200.
    """
    dispatch_directory_walk_tasks(directory_pk)
    ok_response = HttpResponse(status=200)
    return ok_response

trim_text(data) #

Trim the text fields to TEXT_LIMIT chars

Source code in snoop/data/views.py
def trim_text(data):
    """ Cut the `content.text` field of `data` down to TEXT_LIMIT chars.

    The dict is modified in place and returned. When the text is longer than
    TEXT_LIMIT, a marker line is appended after the cut. Data without content,
    or without text, is returned unchanged.
    """
    content = data.get('content')
    if content:
        body = content.get('text')
        # For images and the like, text is None.
        if body:
            if len(body) > TEXT_LIMIT:
                body = body[:TEXT_LIMIT] + "\n\n=== Long text trimmed by Hoover ===\n"
            data['content']['text'] = body
    return data