Skip to content

Pdf tools

snoop.data.pdf_tools #

Helpers for working with PDF files (splitting into pages, fetching info, extracting text for UI Find tool)

Functions#

apply_pdf_tools(request, blob, max_size_before_stream) #

Apply processing to PDF files based on GET params.

Request GET params: 'X-Hoover-PDF-Info' (if set, return the page count and a division of the pages into chunks), 'X-Hoover-PDF-Split-Page-Range', and 'X-Hoover-PDF-Extract-Text'.

Source code in snoop/data/pdf_tools.py
def apply_pdf_tools(request, blob, max_size_before_stream):
    """
    Apply processing to PDF files based on GET params.

    Request GET params:
        - 'X-Hoover-PDF-Info'
            - if set, return page count, and a division of pages
        - 'X-Hoover-PDF-Split-Page-Range'
        - 'X-Hoover-PDF-Extract-Text'

    Args:
        request: incoming HTTP request; its `method`, `GET` and `headers`
            attributes are read.
        blob: object exposing `mount_path()`, `size` and `pk` for the PDF.
        max_size_before_stream: forwarded unchanged to the range/text helper.

    Returns:
        `None` when the request is not a GET or carries none of the params
        above (the caller is expected to handle it normally); a 400
        `HttpResponse` when an HTTP `Range` header is present or when the info
        param is combined with another param; a `JsonResponse` wrapping
        `get_pdf_info` for info requests; otherwise the value returned by the
        range/text helper.
    """

    HEADER_RANGE = 'X-Hoover-PDF-Split-Page-Range'
    HEADER_PDF_INFO = 'X-Hoover-PDF-Info'
    HEADER_PDF_EXTRACT_TEXT = 'X-Hoover-PDF-Extract-Text'

    # pass over unrelated requests
    _get_info = request.GET.get(HEADER_PDF_INFO, '')
    _get_range = request.GET.get(HEADER_RANGE, '')
    _get_text = request.GET.get(HEADER_PDF_EXTRACT_TEXT, '')

    if request.method != 'GET':
        return None
    if not (_get_info or _get_range or _get_text):
        return None

    if request.headers.get('Range'):
        log.warning('PDF Tools: Reject Range query')
        return HttpResponse('X-Hoover-PDF does not work with HTTP-Range', status=400)

    # The info query cannot be combined with splitting or text extraction.
    if _get_info and (_get_range or _get_text):
        log.warning('PDF Tools: Reject Bad Arguments')
        # Name the parameter exactly as clients must send it.
        return HttpResponse(f'{HEADER_PDF_INFO} must be only arg', status=400)

    def _add_headers(response, content_type):
        # Echo the requested params back on the response, next to the content type.
        response['Content-Type'] = content_type
        response[HEADER_PDF_INFO] = _get_info
        response[HEADER_RANGE] = _get_range
        response[HEADER_PDF_EXTRACT_TEXT] = _get_text
        return response

    with blob.mount_path() as blob_path:
        if _get_info:
            # Info needs no scratch files, so answer before creating any.
            return JsonResponse(get_pdf_info(blob_path))

        # for very big PDFs >50MB, use lockfile so we don't OOM...
        blob_size_mb = blob.size / 2**20
        if blob_size_mb > 50:
            _func = _lock_get_range_or_text
        else:
            _func = _do_get_range_or_text

        with NamedTemporaryFile(prefix='pdf-split') as split_file, \
                NamedTemporaryFile(prefix='pdf-text') as text_file:
            return _func(
                blob.pk,
                blob_path,
                _get_range,
                split_file,
                _get_text,
                text_file,
                _add_headers,
                max_size_before_stream,
            )

get_pdf_info(path) #

streaming wrapper to extract pdf info json (page count, chunks)

Source code in snoop/data/pdf_tools.py
def get_pdf_info(path):
    """Return size, page count and suggested page-range chunks for a PDF.

    The page count is read with `qpdf --show-npages`. Pages are then divided
    into consecutive `"first-last"` ranges (1-based, inclusive), aiming for
    roughly 25 MB per chunk and never more than `MAX_PDF_PAGES_PER_CHUNK`
    pages in one chunk.

    Args:
        path: filesystem path of the PDF file.

    Returns:
        dict with keys `size_mb`, `expected_chunk_size_mb`, `page_count`
        and `chunks` (list of `"first-last"` strings that together cover
        every page exactly once; empty when the PDF reports zero pages).

    Raises:
        subprocess.CalledProcessError: propagated from `run_script` when
            qpdf exits with a non-zero status.
        ValueError: when qpdf's output is not an integer.
    """
    import shlex  # local import: only needed to quote the path for bash

    # Quote the path so spaces or shell metacharacters cannot break the script.
    script = f"qpdf --show-npages {shlex.quote(str(path))}"
    page_count = int(run_script(script).decode('ascii'))
    size_mb = round(os.stat(path).st_size / 2**20, 3)

    DESIRED_CHUNK_MB = 25
    chunk_count = max(1, int(math.ceil(size_mb / DESIRED_CHUNK_MB)))
    pages_per_chunk = int(math.ceil((page_count + 1) / chunk_count))
    # Clamp to at least one page so the range() step below is never zero.
    pages_per_chunk = max(1, min(pages_per_chunk, MAX_PDF_PAGES_PER_CHUNK))

    # Step through the pages instead of looping `chunk_count` times: a fixed
    # loop count emits inverted ranges such as "10-8" when there are few pages,
    # and drops trailing pages when the per-chunk page cap applies.
    chunks = [
        f'{first}-{min(first + pages_per_chunk - 1, page_count)}'
        for first in range(1, page_count + 1, pages_per_chunk)
    ]
    # Base the estimate on the number of chunks actually produced.
    expected_chunk_size_mb = round(size_mb / max(1, len(chunks)), 3)

    return {
        'size_mb': size_mb,
        'expected_chunk_size_mb': expected_chunk_size_mb,
        'page_count': page_count,
        'chunks': chunks,
    }

pdf_extract_text(infile, outfile) #

Extract pdf text using javascript.

Source code in snoop/data/pdf_tools.py
def pdf_extract_text(infile, outfile):
    """Extract pdf text using javascript.

    Runs `/opt/hoover/snoop/pdf-tools/run.sh <infile> <outfile>` through
    `run_script`; the script's stdout is discarded.

    Args:
        infile: path of the PDF to read.
        outfile: path handed to the script as its second argument
            (presumably where the extracted text is written — confirm against run.sh).

    Raises:
        subprocess.CalledProcessError: propagated from `run_script` when the
            script exits with a non-zero status.
    """
    import shlex  # local import: only needed to quote the paths for bash

    # Quote both paths so spaces or shell metacharacters stay inside one argument.
    script = (
        '/opt/hoover/snoop/pdf-tools/run.sh '
        f'{shlex.quote(str(infile))} {shlex.quote(str(outfile))}'
    )
    run_script(script)

run_script(script, timeout = '120s', kill = '130s') #

Call the script and return the stdout; add 2min timeout

Source code in snoop/data/pdf_tools.py
def run_script(script, timeout='120s', kill='130s'):
    """Run a bash script under `/usr/bin/timeout` and return its stdout.

    The script is executed from a fresh temporary directory that is removed
    when the call finishes. Bash runs with `-e`, `-x` and `-o pipefail`.

    Args:
        script: bash source to execute.
        timeout: duration passed to `timeout` as the time limit.
        kill: duration passed to `timeout -k`.

    Returns:
        bytes: the captured standard output of the script.

    Raises:
        subprocess.CalledProcessError: when the command exits non-zero.
    """
    # NOTE: qpdf can be very noisy on stderr; truncating it via
    # `2> >(head -c2000 >&2)` was tried here and is currently switched off.
    with TemporaryDirectory(prefix='pdf-tools-pwd-') as workdir:
        script = 'cd ' + workdir + '; ' + script
        timeout_prefix = ['/usr/bin/timeout', '-k', kill, timeout]
        bash_call = ['/bin/bash', '-exo', 'pipefail', '-c', script]
        # Log the exact script text before running it.
        log.warning('+ %s', script)
        return subprocess.check_output(timeout_prefix + bash_call, cwd=workdir)

split_pdf_file(path, _range, dest_path) #

streaming wrapper to split pdf file into a page range.

Source code in snoop/data/pdf_tools.py
def split_pdf_file(path, _range, dest_path):
    """Write the selected page range of a PDF to a new file using qpdf.

    Args:
        path: path of the source PDF.
        _range: qpdf page-range expression (e.g. `"1-25"`). An empty or
            whitespace-only value is omitted from the command line, which
            qpdf treats as "all pages".
        dest_path: path of the PDF file to create.

    Raises:
        subprocess.CalledProcessError: propagated from `run_script` when qpdf
            exits with a non-zero status.
    """
    import shlex  # local import: only needed to quote arguments for bash

    # The range may come from a request parameter; quoting keeps it a single
    # argument so it cannot add extra shell commands or qpdf options.
    range_text = str(_range).strip()
    range_arg = shlex.quote(range_text) if range_text else ''

    script = (
        " qpdf --empty --no-warn --warning-exit-0 --deterministic-id "
        " --object-streams=generate  --remove-unreferenced-resources=yes "
        " --no-original-object-ids "
        f" --pages {shlex.quote(str(path))} {range_arg}  -- {shlex.quote(str(dest_path))}"
    )
    run_script(script)