Magic
snoop.data.magic
#
Guess mime types from content and filename.
Uses the `file` executable (libmagic) to guess the mime type, even if the extension is incorrect.
In some cases, the correct mime type is only discovered when the extension is present. For example, all
".docx", ".xlsx" and similar ".***x" Microsoft Office files are actually zips with XMLs inside - so it is
impossible for `file` to differentiate them from the content alone, without implementing decompression too.
Last, we have our own additions to this system, in order to try and differentiate between some ambiguous
cases that even `file` doesn't take into account, such as the difference between a single E-mail file and
an MBOX collection.
Classes#
Magic
#
Wrapper for running various "file" commands over Blobs.
Used internally when creating snoop.data.models.Blob
instances.
Functions#
looks_like_email(path)
#
Improvised check to detect RFC 822 emails.
Will look for usual headers in the first 64K of the file.
Needed because emails are sometimes detected by libmagic
as text or something else.
Source code in snoop/data/magic.py
def looks_like_email(path):
    """Heuristically decide whether a file is an RFC 822 email.

    The leading chunk of the file (64K requested through `read_exactly`) is decoded as latin-1
    and scanned line by line; the text before the first colon on each line is treated as a
    candidate header name. The file is reported as an email when at least two distinct
    well-known header names are present.

    This exists because `libmagic` sometimes classifies emails as text or something else.

    Args:
        path: object exposing `open('rb')` that returns a binary file object.

    Returns:
        True if enough known headers were found, False otherwise.
    """
    known_headers = {
        "Relay-Version", "Return-Path", "From", "To",
        "Received", "Message-Id", "Date", "In-Reply-To", "Subject",
    }
    min_hits = 2
    read_size = 1024 * 64

    with path.open('rb') as f:
        raw = read_exactly(f, read_size)
    text = raw.decode('latin-1')

    seen = set()
    for line in text.splitlines():
        if ':' not in line:
            continue
        # Normalize capitalization so e.g. "message-id" and "MESSAGE-ID" both match "Message-Id".
        candidate = line.partition(':')[0]
        seen.add(candidate.strip().title())

    return len(seen & known_headers) >= min_hits
looks_like_emlx_email(path)
#
Improvised check to detect Apple format emails.
Warning
Only looks at the first line of the Apple-specific prefix (within the first 20 bytes of the file) and checks that it is made of digits. We probably want to revisit this and check the remainder of the email message too.
Source code in snoop/data/magic.py
def looks_like_emlx_email(path):
    """Improvised check to detect Apple format (emlx) emails.

    Reads the start of the file (20 bytes requested through `read_exactly`), decodes it as
    latin-1 and reports whether the first line found there, stripped of surrounding
    whitespace, consists only of digits.

    Warning:
        Only the first line of the Apple-specific prefix is inspected.
        We probably want to revisit this and check the remainder of the email message too.

    Args:
        path: object exposing `open('rb')` that returns a binary file object.

    Returns:
        True if the first line is made only of digits; False otherwise, including when
        no content could be read (e.g. an empty file).
    """
    with path.open('rb') as f:
        content = read_exactly(f, 20).decode('latin-1')

    lines = content.splitlines()
    if not lines:
        # `''.splitlines()` is `[]`, so indexing the first line would raise IndexError;
        # content with no lines cannot be an emlx email.
        return False

    return lines[0].strip().isdigit()
looks_like_mbox(path)
#
Improvised check to detect MBOX format email archives.
This is done by counting the usual email headers in the file.
Warning
This does not detect MBOX files with fewer than 3 emails inside them.
Source code in snoop/data/magic.py
def looks_like_mbox(path):
    """Heuristically decide whether a file is an MBOX email archive.

    The file is read line by line as latin1 text. Each line may satisfy at most one of the
    not-yet-seen patterns from `MBOX_PATTERNS`; once every pattern has been seen, one email is
    counted and the pattern set starts over.

    Warning:
        Archives holding fewer than `MBOX_MINIMUM_EMAILS` emails are not detected.

    Args:
        path: object exposing `open('r', encoding=...)` that returns a text file object.

    Returns:
        True as soon as `MBOX_MINIMUM_EMAILS` emails have been counted, False if the
        file ends before that.
    """
    found = 0
    remaining = set(MBOX_PATTERNS)

    with path.open('r', encoding='latin1') as f:
        for line in f:
            # A single line consumes at most one outstanding pattern.
            hit = next((p for p in remaining if re.match(p, line)), None)
            if hit is not None:
                remaining.remove(hit)

            if remaining:
                continue

            # All patterns seen: count one email and start looking for the next one.
            remaining = set(MBOX_PATTERNS)
            found += 1
            if found >= MBOX_MINIMUM_EMAILS:
                return True

    return False