Checkdata
snoop.data.management.commands.checkdata
Check data for orphaned Blobs or discrepancies between S3 and the database.
Optionally delete the orphaned database Blob entries and S3 objects.
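The command can also be invoked programmatically through Django's call_command. A minimal sketch, assuming a configured snoop settings module and a collection named testdata (the name is illustrative):

from django.core.management import call_command

# Equivalent to:
#   ./manage.py checkdata --collection testdata --min-age-hours 4 --delete-orphaned
call_command('checkdata', collection='testdata', min_age_hours=4, delete_orphaned=True)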
Classes
Command
Check data for orphaned Blobs or discrepancies between S3 and the database.
Methods
add_arguments(self, parser)
Adds flags for collection selection, minimum object age, and orphan deletion.
Source code in snoop/data/management/commands/checkdata.py
def add_arguments(self, parser):
    """Adds flags for collection selection, minimum object age, and orphan deletion."""
    parser.add_argument('--collection', default='__ALL__',
                        help="Check specific collection. By default, check all of them.")
    parser.add_argument('--min-age-hours', type=int, default=2,
                        help="Minimum object age (from date modified) for it to be checked/deleted.")
    parser.add_argument('--delete-orphaned', action='store_true', default=False,
                        help="Delete orphaned Blob objects from the database and S3.")
handle(self, *args, **options)
Runs the orphan and S3-consistency checks for one collection, or for all of them.
Source code in snoop/data/management/commands/checkdata.py
def handle(self, *args, **options):
    """Runs the orphan and S3-consistency checks for one collection, or for all of them."""
    logging_for_management_command()
    collection = options['collection']
    if collection == '__ALL__':
        all_collections = list(collections.ALL.values())
    else:
        all_collections = [collections.ALL[collection]]

    for col in all_collections:
        with col.set_current():
            log.info('\n============\nchecking collection %s\n==============', col.name)
            errors = 0
            errors += check_blobs_orphaned(options['delete_orphaned'], options['min_age_hours'])
            errors += check_blobs_vs_s3()
            if errors > 0:
                log.error('found %s errors in collection %s', errors, col.name)
            else:
                log.info('collection %s has no errors', col.name)
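The same checks can also be driven from a Django shell without the command wrapper. A sketch using only the snoop APIs that appear in handle() above (the collection name testdata is made up):

from snoop.data import collections
from snoop.data.management.commands.checkdata import check_blobs_orphaned, check_blobs_vs_s3

# Run both checks for a single collection, inside its storage context.
col = collections.ALL['testdata']
with col.set_current():
    errors = check_blobs_orphaned(delete=False, min_age_hours=2)
    errors += check_blobs_vs_s3()
print('errors found:', errors)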
Functions
check_blobs_orphaned(delete=False, min_age_hours=2)
Look for orphaned database Blob entries.
This is done by automatically fetching all related field names and building a single query that checks them all. This approach is better than manually listing the fields, since the query does not need to be updated when new relations are added.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
delete | bool | whether to delete found entries from the database and S3. | False |
min_age_hours | int | objects modified more recently than this many hours ago are ignored. | 2 |
Source code in snoop/data/management/commands/checkdata.py
def check_blobs_orphaned(delete=False, min_age_hours=2):
    """Look for orphaned database Blob entries.

    This is done by automatically fetching all related field names and
    building a single query that checks them all. This approach is better
    than manually listing the fields, since it does not need to be updated.

    Args:
        delete: whether to delete found entries from the database and S3.
        min_age_hours: objects modified more recently than this many hours ago are ignored.
    """
    # Get all related fields of the Blob model.
    # is_relation=True filters out the plain (non-relation) fields.
    # concrete=False filters out the foreign keys on Blob itself (links to parents).
    fields = [f.name for f in models.Blob._meta.get_fields(include_hidden=True)
              if f.is_relation and not f.concrete]
    log.debug('found fields: %s', str(fields))
    query_args = {f + '__isnull': True for f in fields}
    orphaned_blobs = (
        models.Blob.objects
        .filter(**query_args)
        .filter(date_modified__lt=timezone.now() - timedelta(hours=min_age_hours))
        .order_by('pk')
    )

    if not orphaned_blobs.exists():
        return 0

    count = orphaned_blobs.count()
    total_size = orphaned_blobs.aggregate(Sum('size'))['size__sum']
    log.warning('found ORPHANED BLOBS: count = %s, size = %s!', count, pretty_size(total_size))
    if delete:
        log.info('starting DELETE of %s Orphaned Blobs...', count)
        s3_deleted, db_deleted = delete_blobs(orphaned_blobs, count)
        log.warning('DELETED Orphaned Blobs: S3 count = %s, Database count = %s',
                    s3_deleted, db_deleted)
        count = orphaned_blobs.count()
    return count
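To make the reverse-relation trick concrete, here is a minimal sketch on hypothetical models (Author and Book are illustrative, not part of snoop): a row is orphaned when every reverse relation pointing at it is NULL.

from django.db import models as dj_models

class Author(dj_models.Model):
    name = dj_models.CharField(max_length=100)

class Book(dj_models.Model):
    author = dj_models.ForeignKey(Author, on_delete=dj_models.CASCADE)

# Collect every reverse relation pointing at Author; requiring all of them
# to be NULL finds rows that no Book (or any future related model) references.
reverse_fields = [f.name for f in Author._meta.get_fields(include_hidden=True)
                  if f.is_relation and not f.concrete]
orphans = Author.objects.filter(**{f + '__isnull': True for f in reverse_fields})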
check_blobs_vs_s3()
Check for differences between the DB and S3 storage media.
Report on:
- S3 objects not in the DB
- DB objects not in S3
- documents with differing sizes between DB and S3
Returns:
Type | Description |
---|---|
int | the number of distinct errors |
Source code in snoop/data/management/commands/checkdata.py
def check_blobs_vs_s3():
    """Check for differences between the DB and S3 storage media.

    Report on:
    - S3 objects not in the DB
    - DB objects not in S3
    - documents with differing sizes between DB and S3

    Returns:
        the number of distinct errors
    """
    def s3_hash_size_iter():
        """Generator that yields (sha, size) tuples in order from S3."""
        s3_object_iterator = settings.BLOBS_S3.\
            list_objects(collections.current().name, recursive=True)
        for obj in s3_object_iterator:
            if obj.is_dir:
                continue
            s3_sha3 = obj.object_name.replace('/', '')
            s3_size = obj.size
            yield s3_sha3, s3_size

    def db_hash_size_iter():
        """Generator that yields (sha, size) tuples in order from the DB."""
        db_iterator = models.Blob.objects.filter(collection_source_key=b'')\
            .order_by('pk').values('pk', 'size', 'date_modified')
        for vals in db_iterator:
            yield vals['pk'], vals['size']

    s3_iter = s3_hash_size_iter()
    db_iter = db_hash_size_iter()

    size_mismatch_count = 0
    size_mismatch_total_size = 0
    missing_from_s3_count = 0
    missing_from_s3_total_size = 0
    missing_from_db_count = 0
    missing_from_db_total_size = 0

    s3_current = next(s3_iter, None)
    db_current = next(db_iter, None)
    s3_size = 0
    db_size = 0

    # While both iterators have items, compare the heads.
    # If the head item hashes are equal, check for a size difference.
    # If they differ, record the smaller hash as missing and advance its iterator.
    while s3_current is not None and db_current is not None:
        s3_hash, s3_size = s3_current
        db_hash, db_size = db_current
        if s3_hash == db_hash:
            if s3_size != db_size:
                size_mismatch_total_size += max(s3_size, db_size)
                size_mismatch_count += 1
            s3_current = next(s3_iter, None)
            db_current = next(db_iter, None)
        elif s3_hash < db_hash:
            missing_from_db_count += 1
            missing_from_db_total_size += s3_size
            s3_current = next(s3_iter, None)
        else:
            missing_from_s3_count += 1
            missing_from_s3_total_size += db_size
            db_current = next(db_iter, None)

    # Drain whichever iterator still has items. Unpack each head so the
    # recorded size belongs to the current object, not a stale one.
    while s3_current is not None:
        _, s3_size = s3_current
        missing_from_db_count += 1
        missing_from_db_total_size += s3_size
        s3_current = next(s3_iter, None)
    while db_current is not None:
        _, db_size = db_current
        missing_from_s3_count += 1
        missing_from_s3_total_size += db_size
        db_current = next(db_iter, None)

    if size_mismatch_count:
        log.warning('found SIZE MISMATCH: count = %s, size = %s',
                    size_mismatch_count, pretty_size(size_mismatch_total_size))
    if missing_from_db_count:
        log.warning('found MISSING FROM DB but present in S3: count = %s, size = %s',
                    missing_from_db_count, pretty_size(missing_from_db_total_size))
    if missing_from_s3_count:
        log.warning('found MISSING FROM S3 but present in DB: count = %s, size = %s',
                    missing_from_s3_count, pretty_size(missing_from_s3_total_size))

    return missing_from_db_count + missing_from_s3_count
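The comparison loop above is a standard merge of two sorted streams. A toy, self-contained version of the same pattern (pure Python, no snoop dependencies):

def diff_sorted(a, b):
    """Walk two sorted (key, size) streams in lockstep and classify each key."""
    a_it, b_it = iter(a), iter(b)
    a_cur, b_cur = next(a_it, None), next(b_it, None)
    only_a, only_b, mismatch = [], [], []
    while a_cur is not None and b_cur is not None:
        if a_cur[0] == b_cur[0]:
            # Same key on both sides: check the sizes, advance both streams.
            if a_cur[1] != b_cur[1]:
                mismatch.append(a_cur[0])
            a_cur, b_cur = next(a_it, None), next(b_it, None)
        elif a_cur[0] < b_cur[0]:
            only_a.append(a_cur[0])
            a_cur = next(a_it, None)
        else:
            only_b.append(b_cur[0])
            b_cur = next(b_it, None)
    # Drain whichever stream is left.
    while a_cur is not None:
        only_a.append(a_cur[0])
        a_cur = next(a_it, None)
    while b_cur is not None:
        only_b.append(b_cur[0])
        b_cur = next(b_it, None)
    return only_a, only_b, mismatch

# diff_sorted([('a', 1), ('b', 2)], [('b', 3), ('c', 4)])
# -> (['a'], ['c'], ['b'])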
delete_blobs(blob_iterator, expected_count)
Delete the database and S3 entries for the Blobs in the given iterator.
Reports progress as a percentage.
Returns an (s3, db) tuple with the actual number of items deleted from each.
Source code in snoop/data/management/commands/checkdata.py
def delete_blobs(blob_iterator, expected_count):
    """Delete the database and S3 entries for the Blobs in the given iterator.

    Reports progress as a percentage.
    Returns an (s3, db) tuple with the actual number of items deleted from each.
    """
    deleted_s3 = 0
    deleted_db = 0
    expected_count += 1
    UPDATE_EVERY = math.ceil(expected_count / 11)
    for i, blob in enumerate(blob_iterator):
        if (i + 1) % UPDATE_EVERY == 0:
            log.info('DELETE %s%%', int(100 * i / expected_count))
        try:
            settings.BLOBS_S3.remove_object(collections.current().name,
                                            models.blob_repo_path(blob.pk))
            deleted_s3 += 1
        except Exception as e:
            log.debug(e)
        blob.delete()
        deleted_db += 1
    return deleted_s3, deleted_db
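delete_blobs can be pointed at any Blob queryset directly. A hypothetical example with a hand-picked filter (the filter is illustrative, not a recommended cleanup rule):

# Delete all zero-size Blobs for the current collection (illustrative only).
stale = models.Blob.objects.filter(size=0).order_by('pk')
s3_n, db_n = delete_blobs(stale, stale.count())
log.info('removed %s S3 objects and %s database rows', s3_n, db_n)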