Pdf preview
snoop.data.analyzers.pdf_preview
#
Task to call a service that creates pdf previews for various types of documents.
The service used can be found here: [[thecodingmachine/gotenberg]]
Attributes#
PDF_PREVIEW_EXTENSIONS
#
List of file extensions that the pdf generator supports. Based on [[gotenberg.dev/docs/modules/libreoffice]].
PDF_PREVIEW_MIME_TYPES
#
List of mime types that the pdf generator supports. Based on [[gotenberg.dev/docs/modules/libreoffice]].
PDF_PREVIEW_MIN_SPEED_BPS
#
Minimum reference speed for this task. Saved as 10% of the Average Success
Speed in the Admin UI. The timeout is calculated as PDF_PREVIEW_TIMEOUT_BASE
plus the request file size divided by this value, and is capped at
PDF_PREVIEW_TIMEOUT_MAX.
PDF_PREVIEW_TIMEOUT_BASE
#
Minimum number of seconds to wait for this service.
PDF_PREVIEW_TIMEOUT_MAX
#
Maximum number of seconds to wait for this service. For PDF preview we allow 2h.
Functions#
call_pdf_generator(data, filename, size)
#
Executes an HTTP POST request to the pdf generator service and returns the generated PDF content.
Source code in snoop/data/analyzers/pdf_preview.py
def call_pdf_generator(data, filename, size):
    """Executes an HTTP POST request to the pdf generator service.

    The upload goes to the `forms/libreoffice/convert` route under
    `settings.SNOOP_PDF_PREVIEW_URL`.

    Args:
        data: content (or open file object) uploaded as the `files` form field.
        filename: file name reported to the service for the upload.
        size: size of the upload, used to scale the request timeout.

    Returns:
        The body of the response (`resp.content`), i.e. the generated PDF.

    Raises:
        SnoopTaskBroken: if the service answers with HTTP 504, with any other
            non-200 status, or with a `Content-Type` other than
            `application/pdf` (including a missing header).
    """
    url = settings.SNOOP_PDF_PREVIEW_URL + 'forms/libreoffice/convert'

    # Timeout grows with the upload size, but never exceeds the configured maximum.
    timeout = min(
        PDF_PREVIEW_TIMEOUT_MAX,
        int(PDF_PREVIEW_TIMEOUT_BASE + size / PDF_PREVIEW_MIN_SPEED_BPS),
    )

    resp = requests.post(url, files={'files': (filename, data)}, timeout=timeout)

    if resp.status_code == 504:
        raise SnoopTaskBroken('pdf generator timed out and returned http 504', 'pdf_preview_http_504')

    # Use .get() so a response without a Content-Type header is reported as a
    # broken task instead of escaping as a bare KeyError.
    if (resp.status_code != 200
            or resp.headers.get('Content-Type') != 'application/pdf'):
        raise SnoopTaskBroken(f'pdf generator returned unexpected response {resp}',
                              'pdf_preview_http_' + str(resp.status_code))

    return resp.content
can_create(blob)
#
Checks if the pdf generator can process this file.
Source code in snoop/data/analyzers/pdf_preview.py
def can_create(blob):
    """Checks if the pdf generator can process this file.

    Args:
        blob: object whose `mime_type` attribute is checked.

    Returns:
        `True` if `blob.mime_type` is listed in `PDF_PREVIEW_MIME_TYPES`,
        `False` otherwise.
    """
    # Return the membership test directly so the result is always a bool
    # (previously the negative case fell through and returned None).
    return blob.mime_type in PDF_PREVIEW_MIME_TYPES
get_pdf(blob)
#
Calls the pdf generator for a given blob.
Stores the resulting pdf preview in the database and returns it.
Source code in snoop/data/analyzers/pdf_preview.py
@snoop_task('pdf_preview.get_pdf', version=3, queue='pdf-preview')
def get_pdf(blob):
    """Generate a pdf preview for `blob` and record it in the database.

    The file extension sent to the service is taken from the first `File`
    entry whose `original` is this blob; when that extension is missing or
    unsupported, it is guessed from the blob's mime type instead.

    Returns:
        The blob created from the generated pdf content.

    Raises:
        SnoopTaskBroken: if pdf previews are disabled for the current
            collection, if the blob cannot be processed, or if no supported
            file extension can be determined.
    """
    preview_allowed = current_collection().pdf_preview_enabled and can_create(blob)
    if not preview_allowed:
        raise SnoopTaskBroken('pdf preview disabled', 'pdf_preview_disabled')

    # the service needs to receive a filename but the original filename might be broken
    DEFAULT_FILENAME = 'a'

    try:
        original_name = models.File.objects.filter(original=blob.pk)[0].name
    except IndexError:
        log.warning('no File entry for this object!')
        suffix = None
    else:
        suffix = os.path.splitext(original_name)[1]

    if suffix not in PDF_PREVIEW_EXTENSIONS:
        # fall back to an extension derived from the mime type
        suffix = mimetypes.guess_extension(blob.mime_type)
        if suffix not in PDF_PREVIEW_EXTENSIONS:
            raise SnoopTaskBroken('no valid file extension guessed', 'invalid_file_extension')

    upload_name = DEFAULT_FILENAME + suffix
    with blob.open() as source:
        pdf_bytes = call_pdf_generator(source, upload_name, blob.size)

    blob_pdf_preview = models.Blob.create_from_bytes(pdf_bytes)

    # create PDF object in pdf preview model
    models.PdfPreview.objects.update_or_create(
        blob=blob,
        defaults={'pdf_preview': blob_pdf_preview},
    )
    return blob_pdf_preview