Thumbnails
snoop.data.analyzers.thumbnails
#
Task that is calling a thumbnail generation service.
Three thumbnails in different sizes are created. The service used can be found here: [[FPurchess/preview-service]].
Attributes#
MIN_SPEED_BPS
#
Minimum reference speed for this task. Saved as 10% of the Average Success
Speed in the Admin UI. The timeout is calculated using this value, the request
file size, and the previous TIMEOUT_BASE
constant.
THUMBNAIL_MIME_TYPES
#
List of MIME types that the thumbnail service supports. Based on [[github.com/algoo/preview-generator/blob/develop/doc/supported_mimetypes.rst]]
THUMBNAIL_TRUNCATE_FILE_SIZE
#
On files larger than this limit, truncate them when sending. This ensures thumbnail generation doesn't clog up our pipeline, instead preferring to fail after 50/300MB for huge PDFs/Words.
TIMEOUT_BASE
#
Minimum number of seconds to wait for this service.
TIMEOUT_MAX
#
Maximum number of seconds to wait for this service.
Functions#
call_thumbnails_service(blob, size)
#
Executes an HTTP POST request to the Thumbnail service.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
blob |
|
the file for which a thumbnail will be created. |
required |
size |
|
the size for the created thumbnail (thumbnail will be size x size) |
required |
Source code in snoop/data/analyzers/thumbnails.py
def call_thumbnails_service(blob, size):
    """Executes an HTTP POST request to the Thumbnail service.

    The file is uploaded as a multipart form field named ``file`` to the
    ``preview/{size}x{size}`` endpoint under ``settings.SNOOP_THUMBNAIL_URL``.
    The request timeout grows with the (truncated) file size and is capped at
    ``TIMEOUT_MAX``.

    Args:
        blob: the file for which a thumbnail will be created.
        size: the size for the created thumbnail (thumbnail will be size x size)

    Returns:
        The body of the service response (``resp.content``); the response is
        only accepted when its Content-Type is ``image/jpeg``.

    Raises:
        SnoopTaskBroken: if the request fails for any reason (timeout,
            connection error, ...), or if the service answers with a status
            other than 200 or with a Content-Type other than ``image/jpeg``.
    """
    url = settings.SNOOP_THUMBNAIL_URL + f'preview/{size}x{size}'

    # Only the truncated part of the file is sent, so only that part counts
    # towards the expected transfer/processing time.
    actual_size = min(blob.size, THUMBNAIL_TRUNCATE_FILE_SIZE)
    timeout = min(TIMEOUT_MAX, int(TIMEOUT_BASE + actual_size / MIN_SPEED_BPS))

    # instead of streaming the file, just read some 50MB into a bytes string and send that, capping out
    # the data sent per file for this very slow service.
    with blob.open() as f:
        data = utils.read_exactly(f, THUMBNAIL_TRUNCATE_FILE_SIZE)
    payload = {'file': data}

    try:
        resp = requests.post(url, files=payload, timeout=timeout)
    except Exception as e:
        log.exception(e)
        # Chain the original exception so the root cause stays visible in tracebacks.
        raise SnoopTaskBroken('timeout and/or connection error, timeout = ' + str(round(timeout)) + 's',
                              'thumbnail_timeout') from e

    # Use .get() so a response without a Content-Type header is reported as a
    # broken task instead of crashing with an unrelated KeyError.
    if (resp.status_code != 200
            or resp.headers.get('Content-Type') != 'image/jpeg'):
        raise SnoopTaskBroken(resp.text, 'thumbnail_http_' + str(resp.status_code))

    return resp.content
can_create(blob)
#
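Checks if the thumbnail generator service can process this blob: its mime type must be in THUMBNAIL_MIME_TYPES and its size must be below THUMBNAIL_TRUNCATE_FILE_SIZE.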
Source code in snoop/data/analyzers/thumbnails.py
def can_create(blob):
    """Tell whether a thumbnail should be requested for this blob.

    A blob qualifies when its mime type is listed in ``THUMBNAIL_MIME_TYPES``
    and its size is strictly below ``THUMBNAIL_TRUNCATE_FILE_SIZE``.
    """
    # Unsupported mime type: reject without looking at the size.
    if blob.mime_type not in THUMBNAIL_MIME_TYPES:
        return False
    # Supported type: accept only files under the size limit.
    if blob.size < THUMBNAIL_TRUNCATE_FILE_SIZE:
        return True
    return False
create_resized(size, thumbnail_large_blob, original_blob, source)
#
Utility function to create a resized thumbnail image.
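Calls ImageMagick's `convert` to do the resizing (the large thumbnail is written to a temporary file and the resized image is read from the command's standard output) and creates the thumbnail blob and thumbnail object in the database.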
Parameters:
Name | Type | Description | Default |
---|---|---|---|
size |
|
The maximum size (size x size) the thumbnail image should have (ratio is preserved). |
required |
thumbnail_large_blob |
|
blob of the original large thumbnail |
required |
original_blob |
|
original blob of the document |
required |
source |
|
either pdf_preview or blob |
required |
Source code in snoop/data/analyzers/thumbnails.py
def create_resized(size, thumbnail_large_blob, original_blob, source):
    """Utility function to create a resized thumbnail image.

    Writes the large thumbnail to a temporary file, calls ImageMagick's
    ``convert`` to resize it (the result is read from the command's standard
    output), then creates the thumbnail blob and the thumbnail object in the
    database.

    Args:
        size: The maximum size (size x size) the thumbnail image should have (ratio is preserved).
        thumbnail_large_blob: blob of the original large thumbnail
        original_blob: original blob of the document
        source: either pdf_preview or blob

    Returns:
        True, once the thumbnail record has been created or updated.

    Raises:
        subprocess.CalledProcessError: if ``convert`` exits with a non-zero status.
    """
    # delete=False because `convert` opens the file by name after we close it;
    # removal is handled explicitly in the `finally` below.
    orig = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Leaving the `with orig` block flushes and closes the temp file.
        with orig:
            with thumbnail_large_blob.open() as f:
                orig.write(f.read())

        thumbnail_bytes = subprocess.check_output(
            ['convert', f'jpg:{orig.name}', '-resize', f'{size}x{size}', 'jpg:-'])
    finally:
        # Always clean up, even when writing or converting fails, so failed
        # conversions do not pile up temp files.
        os.remove(orig.name)

    thumbnail_blob = models.Blob.create_from_bytes(thumbnail_bytes)

    models.Thumbnail.objects.update_or_create(
        blob=original_blob,
        size=size,
        defaults={'thumbnail': thumbnail_blob, 'source': source})

    return True
get_thumbnail(blob, pdf_preview = None)
#
Function that calls the thumbnail service for a given blob.
Gets the thumbnail in the largest resolution specified (in the thumbnail model) from the thumbnail service. Then creates the smaller thumbnails by resizing that image in memory.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
blob |
|
Original file that we need a thumbnail for |
required |
pdf_preview |
|
If set, will use this data for the actual creation of the thumbnail. Useful if we have PDF conversions. |
None |
Source code in snoop/data/analyzers/thumbnails.py
@snoop_task('thumbnails.get_thumbnail', version=5, queue='thumbnails')
# the @returns_json_blob decorator is only needed to check if this function ran in digests.gather
@returns_json_blob
def get_thumbnail(blob, pdf_preview=None):
    """Function that calls the thumbnail service for a given blob.

    Gets the thumbnail in the largest resolution specified (in the thumbnail model) from the
    thumbnail service. Then creates the smaller thumbnails by resizing that image
    (see `create_resized`).

    Args:
        blob: Original file that we need a thumbnail for
        pdf_preview: If set to a non-empty Blob, this data is sent to the thumbnail service
            instead of `blob`. Useful if we have PDF conversions.

    Returns:
        True, after thumbnails for all sizes have been stored.

    Raises:
        SnoopTaskBroken: if the thumbnail generator is disabled for the current collection,
            or if `can_create(blob)` rejects the blob. Note that both cases are reported
            with the same 'thumbnails_disabled' code.
    """
    if not current_collection().thumbnail_generator_enabled or not can_create(blob):
        raise SnoopTaskBroken('thumbnail generator disabled', 'thumbnails_disabled')

    # Prefer the PDF preview as input when a usable one was supplied.
    if pdf_preview and isinstance(pdf_preview, models.Blob) and pdf_preview.size > 0:
        source = pdf_preview
    else:
        source = blob

    # Work on an explicit copy so the list handed out by the model is never mutated.
    smaller_sizes = list(models.Thumbnail.SizeChoices.values)
    largest_size = max(smaller_sizes)
    smaller_sizes.remove(largest_size)

    # Only the largest thumbnail comes from the (slow) service ...
    thumbnail_large_bytes = call_thumbnails_service(source, largest_size)
    thumbnail_large_blob = models.Blob.create_from_bytes(thumbnail_large_bytes)

    models.Thumbnail.objects.update_or_create(
        blob=blob,
        size=largest_size,
        defaults={'thumbnail': thumbnail_large_blob, 'source': source}
    )

    # ... the remaining sizes are derived locally from that large thumbnail.
    for size in smaller_sizes:
        create_resized(size, thumbnail_large_blob, blob, source)

    return True