Pdf tools
snoop.data.pdf_tools
#
Helpers for working with PDF files (splitting into pages, fetching info, extracting text for UI Find tool)
Functions#
apply_pdf_tools(request, blob, max_size_before_stream)
#
Apply processing to PDF files based on GET params.
Request GET params: 'X-Hoover-PDF-Info' (if set, return the page count and a division of the pages into chunks), 'X-Hoover-PDF-Split-Page-Range', and 'X-Hoover-PDF-Extract-Text'.
Source code in snoop/data/pdf_tools.py
def apply_pdf_tools(request, blob, max_size_before_stream):
    """Build the PDF-tools response for `blob`, if the request asks for one.

    The tools are selected through GET query parameters:

    - 'X-Hoover-PDF-Info': when set, respond with JSON describing the PDF
      (page count and a division of the pages into chunks). It must be
      the only PDF-tools parameter on the request.
    - 'X-Hoover-PDF-Split-Page-Range': forwarded to the range/text worker.
    - 'X-Hoover-PDF-Extract-Text': forwarded to the range/text worker.

    Args:
        request: incoming request; its method, GET parameters and headers
            are inspected.
        blob: object providing `mount_path()`, `size` and `pk`.
        max_size_before_stream: passed through unchanged to the worker.

    Returns:
        None when the request is not a GET or carries none of the
        parameters above; a 400 response when an HTTP Range header is
        present or when the info parameter is combined with another one;
        otherwise the JSON info response or whatever the worker returns.
    """
    info_param = 'X-Hoover-PDF-Info'
    range_param = 'X-Hoover-PDF-Split-Page-Range'
    text_param = 'X-Hoover-PDF-Extract-Text'

    want_info = request.GET.get(info_param, '')
    want_range = request.GET.get(range_param, '')
    want_text = request.GET.get(text_param, '')

    # Requests that are not aimed at the PDF tools are left to the caller.
    if request.method != 'GET':
        return None
    if not want_info and not want_range and not want_text:
        return None

    if request.headers.get('Range'):
        log.warning('PDF Tools: Reject Range query')
        return HttpResponse('X-Hoover-PDF does not work with HTTP-Range', status=400)

    # The info query cannot be combined with splitting or text extraction.
    if want_info and (want_range or want_text):
        log.warning('PDF Tools: Reject Bad Arguments')
        return HttpResponse('X-Hoover-PDF-Get-Info must be only arg', status=400)

    def _add_headers(response, content_type):
        # Echo the received parameter values back as response headers.
        response['Content-Type'] = content_type
        echoed = (
            (info_param, want_info),
            (range_param, want_range),
            (text_param, want_text),
        )
        for header_name, header_value in echoed:
            response[header_name] = header_value
        return response

    with blob.mount_path() as pdf_path, \
            NamedTemporaryFile(prefix='pdf-split') as split_file, \
            NamedTemporaryFile(prefix='pdf-text') as text_file:
        if want_info:
            return JsonResponse(get_pdf_info(pdf_path))

        # PDFs above 50 MiB go through the locking variant; per the
        # original note this is meant to keep big files from causing OOM.
        over_limit = blob.size / 2**20 > 50
        worker = _lock_get_range_or_text if over_limit else _do_get_range_or_text
        return worker(
            blob.pk,
            pdf_path,
            want_range,
            split_file,
            want_text,
            text_file,
            _add_headers,
            max_size_before_stream,
        )
get_pdf_info(path)
#
Streaming wrapper that extracts PDF info as JSON-ready data (page count, chunks).
Source code in snoop/data/pdf_tools.py
def get_pdf_info(path):
    """Return size, page count and a page-range chunking plan for a PDF.

    The page count is read with ``qpdf --show-npages``. The pages are then
    divided into consecutive ranges, aiming at roughly 25 MB of file size
    per range and never exceeding ``MAX_PDF_PAGES_PER_CHUNK`` pages in one
    range.

    Args:
        path: filesystem path of the PDF file.

    Returns:
        dict with the keys:

        - 'size_mb': file size in MiB, rounded to 3 decimals.
        - 'expected_chunk_size_mb': file size divided by the number of
          chunks, rounded to 3 decimals.
        - 'page_count': number of pages reported by qpdf.
        - 'chunks': list of 1-based, inclusive 'first-last' page ranges.
          Together they cover every page exactly once; the list is empty
          when the PDF has no pages.

    Raises:
        subprocess.CalledProcessError: if the qpdf command fails.
        ValueError: if qpdf's output is not an integer.
    """
    import shlex

    # script = "export JAVA_TOOL_OPTIONS='-Xmx3g'; pdftk - dump_data | grep NumberOfPages | head -n1"
    # script = "pdfinfo - | grep Pages | head -n1"
    # Quote the path: the script is executed by bash, so spaces or shell
    # metacharacters in the path must not be interpreted.
    script = f"qpdf --show-npages {shlex.quote(str(path))}"
    page_count = int(run_script(script).decode('ascii'))
    size_mb = round(os.stat(path).st_size / 2**20, 3)

    DESIRED_CHUNK_MB = 25
    wanted_chunks = max(1, math.ceil(size_mb / DESIRED_CHUNK_MB))
    # Same sizing rule as before (including the +1), but never below one
    # page so the range() step underneath stays valid.
    pages_per_chunk = math.ceil((page_count + 1) / wanted_chunks)
    pages_per_chunk = max(1, min(pages_per_chunk, MAX_PDF_PAGES_PER_CHUNK))

    # Walk over the real pages instead of emitting a fixed number of
    # chunks: a fixed count produced ranges starting past the last page
    # (e.g. '10-9' for a 9-page file) and dropped trailing pages whenever
    # pages_per_chunk was capped by MAX_PDF_PAGES_PER_CHUNK.
    chunks = [
        f'{first}-{min(first + pages_per_chunk - 1, page_count)}'
        for first in range(1, page_count + 1, pages_per_chunk)
    ]
    expected_chunk_size_mb = round(size_mb / max(1, len(chunks)), 3)

    return {
        'size_mb': size_mb,
        'expected_chunk_size_mb': expected_chunk_size_mb,
        'page_count': page_count,
        'chunks': chunks,
    }
pdf_extract_text(infile, outfile)
#
Extract PDF text using JavaScript.
Source code in snoop/data/pdf_tools.py
def pdf_extract_text(infile, outfile):
    """Extract the text of a PDF by running the pdf-tools helper script.

    Runs ``/opt/hoover/snoop/pdf-tools/run.sh <infile> <outfile>`` through
    run_script. According to the original description the helper is
    JavaScript-based; its implementation is not part of this module.

    Args:
        infile: path of the PDF to read.
        outfile: path handed to the helper as its second argument
            (presumably where the extracted text is written).

    Raises:
        subprocess.CalledProcessError: if the helper script fails.
    """
    import shlex

    # Both paths end up inside a bash command line; quote them so spaces
    # or shell metacharacters are passed through literally.
    src = shlex.quote(str(infile))
    dest = shlex.quote(str(outfile))
    script = f'/opt/hoover/snoop/pdf-tools/run.sh {src} {dest}'
    run_script(script)
run_script(script, timeout = '120s', kill = '130s')
#
Call the script and return its stdout; a 2-minute timeout is applied by default.
Source code in snoop/data/pdf_tools.py
def run_script(script, timeout='120s', kill='130s'):
    """Run `script` with bash inside a scratch directory and return stdout.

    The command is wrapped in ``/usr/bin/timeout -k <kill> <timeout>`` and
    executed by ``/bin/bash -exo pipefail -c``. A temporary directory is
    created for the call, used as the working directory, and removed
    afterwards.

    Args:
        script: shell snippet to execute.
        timeout: duration given to ``timeout`` (default two minutes).
        kill: duration given to ``timeout -k``.

    Returns:
        The bytes the command wrote to stdout.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    # NOTE: stderr is not truncated; the redirect that used to cap spammy
    # qpdf warnings is kept below only as a disabled reference.
    # script = script + ' 2> >(head -c2000 >&2)'
    timeout_prefix = ['/usr/bin/timeout', '-k', kill, timeout]
    bash_prefix = ['/bin/bash', '-exo', 'pipefail', '-c']

    with TemporaryDirectory(prefix='pdf-tools-pwd-') as workdir:
        full_script = ''.join((f'cd {workdir}; ', script))
        log.warning('+ %s', full_script)
        argv = [*timeout_prefix, *bash_prefix, full_script]
        return subprocess.check_output(argv, cwd=workdir)
split_pdf_file(path, _range, dest_path)
#
Streaming wrapper that splits a page range out of a PDF file.
Source code in snoop/data/pdf_tools.py
def split_pdf_file(path, _range, dest_path):
    """Write the pages `_range` of the PDF at `path` into `dest_path`.

    Runs qpdf starting from an empty document (``--empty``) and pulls the
    requested pages from `path` with ``--pages``. The remaining flags are
    passed exactly as before; see the qpdf manual for their meaning.

    Args:
        path: path of the source PDF.
        _range: qpdf page range, e.g. '1-10'. An empty value puts no
            range on the command line.
        dest_path: path of the PDF file to write.

    Raises:
        subprocess.CalledProcessError: if qpdf exits with an error.
    """
    import shlex

    # The range may come from a request parameter and everything here is
    # spliced into a bash command line, so quote each piece. Well-formed
    # ranges such as '1-10' are left unchanged by quoting.
    src = shlex.quote(str(path))
    dest = shlex.quote(str(dest_path))
    range_text = str(_range)
    pages = shlex.quote(range_text) if range_text else ''

    script = (
        " qpdf --empty --no-warn --warning-exit-0 --deterministic-id "
        " --object-streams=generate --remove-unreferenced-resources=yes "
        " --no-original-object-ids "
        f" --pages {src} {pages} -- {dest}"
    )
    run_script(script)