Merge pull request 'feat: Add exifread library for enhanced EXIF date extraction' (#16) from fix/exif-date-extraction-improvements into dev
All checks were successful
CI / skip-ci-check (pull_request) Successful in 10s
CI / lint-and-type-check (pull_request) Successful in 1m10s
CI / python-lint (pull_request) Successful in 35s
CI / test-backend (pull_request) Successful in 2m35s
CI / build (pull_request) Successful in 3m48s
CI / secret-scanning (pull_request) Successful in 18s
CI / dependency-scan (pull_request) Successful in 15s
CI / sast-scan (pull_request) Successful in 1m29s
CI / workflow-summary (pull_request) Successful in 9s

Reviewed-on: #16
This commit is contained in:
tanyar09 2026-01-30 12:20:37 -05:00
commit 7cfee99350
2 changed files with 232 additions and 21 deletions

View File

@ -58,9 +58,10 @@ def extract_exif_date(image_path: str) -> Optional[date]:
"""Extract date taken from photo EXIF data - returns Date (not DateTime) to match desktop schema.
Tries multiple methods to extract EXIF date:
1. PIL's getexif() (modern method) - uses .get() for tag access
2. PIL's _getexif() (deprecated but sometimes more reliable)
3. Access EXIF IFD directly if available
1. exifread library (most reliable for reading EXIF)
2. PIL's getexif() (modern method) - uses .get() for tag access
3. PIL's _getexif() (deprecated but sometimes more reliable)
4. Access EXIF IFD directly if available
Returns:
Date object or None if no valid EXIF date found
@ -68,6 +69,48 @@ def extract_exif_date(image_path: str) -> Optional[date]:
import logging
logger = logging.getLogger(__name__)
# Try exifread library first (most reliable)
try:
import exifread
with open(image_path, 'rb') as f:
tags = exifread.process_file(f, details=False)
# Look for date tags in exifread format
# exifread uses tag names like 'EXIF DateTimeOriginal', 'Image DateTime', etc.
date_tag_names = [
'EXIF DateTimeOriginal', # When photo was taken (highest priority)
'EXIF DateTimeDigitized', # When photo was digitized
'Image DateTime', # File modification date
'EXIF DateTime', # Alternative format
]
for tag_name in date_tag_names:
if tag_name in tags:
date_str = str(tags[tag_name])
if date_str:
try:
# exifread returns dates in format "YYYY:MM:DD HH:MM:SS"
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
extracted_date = dt.date()
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}")
return extracted_date
except ValueError:
# Try alternative format
try:
dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
extracted_date = dt.date()
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}")
return extracted_date
except ValueError:
continue
except ImportError:
logger.debug("exifread library not available, falling back to PIL")
except Exception as e:
logger.debug(f"exifread failed for {image_path}: {e}, trying PIL")
# Fallback to PIL methods
try:
with Image.open(image_path) as image:
exifdata = None
@ -93,9 +136,18 @@ def extract_exif_date(image_path: str) -> Optional[date]:
logger.debug(f"Deprecated _getexif() failed for {image_path}: {e}")
if not exifdata:
logger.debug(f"No EXIF data found in {image_path}")
logger.warning(f"No EXIF data found in {image_path} - will fall back to file modification time")
return None
# Debug: Log all available EXIF tags (only in debug mode to avoid spam)
if logger.isEnabledFor(logging.DEBUG):
try:
if hasattr(exifdata, 'items'):
all_tags = list(exifdata.items())[:20] # First 20 tags for debugging
logger.debug(f"Available EXIF tags in {image_path}: {all_tags}")
except Exception:
pass
# Look for date taken in EXIF tags
# Priority: DateTimeOriginal (when photo was taken) > DateTimeDigitized > DateTime (file modification)
date_tags = [
@ -104,17 +156,96 @@ def extract_exif_date(image_path: str) -> Optional[date]:
306, # DateTime - file modification date (lowest priority)
]
# Try accessing tags - use .get() method for modern API, direct access for old API
# Also try to find any date-like tags by iterating through all tags
# This helps catch dates that might be in different tag IDs
all_date_strings = []
try:
if hasattr(exifdata, 'items'):
for tag_id, value in exifdata.items():
if value and isinstance(value, (str, bytes)):
value_str = value.decode('utf-8', errors='ignore') if isinstance(value, bytes) else str(value)
# Check if it looks like a date string (YYYY:MM:DD or YYYY-MM-DD format)
if len(value_str) >= 10 and ('-' in value_str[:10] or ':' in value_str[:10]):
try:
# Try to parse it as a date
if ':' in value_str[:10]:
test_dt = datetime.strptime(value_str[:19], "%Y:%m:%d %H:%M:%S")
else:
test_dt = datetime.strptime(value_str[:19], "%Y-%m-%d %H:%M:%S")
all_date_strings.append((tag_id, value_str, test_dt.date()))
except (ValueError, IndexError):
pass
except Exception as e:
logger.debug(f"Error iterating through all EXIF tags in {image_path}: {e}")
# Try accessing tags - use multiple methods for compatibility
for tag_id in date_tags:
try:
# Use .get() method for modern Exif object, direct access for dict-like old API
# Try multiple access methods for compatibility
date_str = None
if is_modern_api:
date_str = exifdata.get(tag_id)
# Modern getexif() API - try multiple access methods
# The Exif object from getexif() supports dictionary-like access
try:
# Method 1: Try .get() method
if hasattr(exifdata, 'get'):
date_str = exifdata.get(tag_id)
else:
date_str = None
# Method 2: If .get() returned None, try direct access
if not date_str:
try:
# Exif objects support __getitem__ for tag access
date_str = exifdata[tag_id]
except (KeyError, TypeError, AttributeError):
pass
# Method 3: Try iterating through all tags
if not date_str:
try:
# Exif objects are iterable
for key, value in exifdata.items():
if key == tag_id:
date_str = value
break
except (AttributeError, TypeError):
pass
# Method 4: Try using ExifTags.TAGS to help identify tags
if not date_str:
try:
from PIL.ExifTags import TAGS
# Log what tags are available for debugging
if logger.isEnabledFor(logging.DEBUG):
available_tag_ids = list(exifdata.keys())[:10]
logger.debug(f"Available tag IDs in {image_path}: {available_tag_ids}")
for tid in available_tag_ids:
tag_name = TAGS.get(tid, f"Unknown({tid})")
logger.debug(f" Tag {tid} ({tag_name}): {exifdata.get(tid)}")
except (ImportError, AttributeError, TypeError):
pass
except Exception as e:
logger.debug(f"Error accessing tag {tag_id} with modern API: {e}")
date_str = None
else:
# Old _getexif() returns a dict-like object
date_str = exifdata.get(tag_id) if hasattr(exifdata, 'get') else (exifdata[tag_id] if tag_id in exifdata else None)
if hasattr(exifdata, 'get'):
date_str = exifdata.get(tag_id)
elif hasattr(exifdata, '__getitem__'):
try:
if tag_id in exifdata:
date_str = exifdata[tag_id]
except (KeyError, TypeError):
pass
if date_str:
# Ensure date_str is a string, not bytes or other type
if isinstance(date_str, bytes):
date_str = date_str.decode('utf-8', errors='ignore')
elif not isinstance(date_str, str):
date_str = str(date_str)
# Parse EXIF date format (YYYY:MM:DD HH:MM:SS)
try:
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
@ -143,6 +274,23 @@ def extract_exif_date(image_path: str) -> Optional[date]:
logger.debug(f"Error accessing tag {tag_id} in {image_path}: {e}")
continue
# If we found date strings by iterating, try them (prioritize DateTimeOriginal-like dates)
if all_date_strings:
# Order candidates by explicit tag priority (not by numeric tag ID)
# Priority: DateTimeOriginal (36867) > DateTimeDigitized (36868) > DateTime (306) > others
all_date_strings.sort(key=lambda x: (
0 if x[0] == 36867 else # DateTimeOriginal first
1 if x[0] == 36868 else # DateTimeDigitized second
2 if x[0] == 306 else # DateTime third
3 # Other dates last
))
for tag_id, date_str, extracted_date in all_date_strings:
# Validate date
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
logger.info(f"Successfully extracted date {extracted_date} from tag {tag_id} (found by iteration) in {image_path}")
return extracted_date
# Try accessing EXIF IFD directly if available (for tags in EXIF IFD like DateTimeOriginal)
try:
if hasattr(exifdata, 'get_ifd'):
@ -152,7 +300,17 @@ def extract_exif_date(image_path: str) -> Optional[date]:
logger.debug(f"Trying EXIF IFD for {image_path}")
for tag_id in date_tags:
try:
date_str = exif_ifd.get(tag_id) if hasattr(exif_ifd, 'get') else (exif_ifd[tag_id] if tag_id in exif_ifd else None)
# Try multiple access methods for IFD
date_str = None
if hasattr(exif_ifd, 'get'):
date_str = exif_ifd.get(tag_id)
elif hasattr(exif_ifd, '__getitem__'):
try:
if tag_id in exif_ifd:
date_str = exif_ifd[tag_id]
except (KeyError, TypeError):
pass
if date_str:
try:
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
@ -293,34 +451,86 @@ def extract_video_date(video_path: str) -> Optional[date]:
def _classify_file_date(candidate: date, today: date, min_age_days: int = 7) -> str:
    """Classify a filesystem-derived date as a possible photo date.

    Args:
        candidate: Date obtained from a file timestamp.
        today: The current date, passed in so one call site uses a single value.
        min_age_days: Dates this many days old or newer are treated as too
            recent (the timestamp is likely from copying the file).

    Returns:
        "invalid" if the date is in the future or before 1900-01-01,
        "recent" if it is within ``min_age_days`` of ``today``,
        "ok" otherwise.
    """
    if candidate > today or candidate < date(1900, 1, 1):
        return "invalid"
    if (today - candidate).days <= min_age_days:
        return "recent"
    return "ok"


def extract_photo_date(image_path: str) -> Optional[date]:
    """Extract the date a photo was taken, falling back to file timestamps.

    Tries in order:
    1. EXIF date via extract_exif_date()
    2. File modification time, unless it is invalid or within the last 7 days
    3. File creation time (st_birthtime where the platform provides it,
       otherwise st_ctime), subject to the same validity rules

    Args:
        image_path: Path to the image file.

    Returns:
        Date object, or None if no plausible date can be determined
        (including when the file does not exist).
    """
    import logging

    logger = logging.getLogger(__name__)

    # First try EXIF date extraction
    date_taken = extract_exif_date(image_path)
    if date_taken:
        logger.info(f"Successfully extracted EXIF date {date_taken} from {image_path}")
        return date_taken

    # EXIF extraction failed - try file timestamps
    logger.warning(f"EXIF date extraction failed for {image_path}, trying file modification time")
    try:
        if not os.path.exists(image_path):
            return None

        today = date.today()

        # Try modification time first
        try:
            mtime_date = datetime.fromtimestamp(os.path.getmtime(image_path)).date()
        except (OSError, ValueError, OverflowError) as e:
            # OverflowError: fromtimestamp() rejects out-of-range timestamps;
            # catching it here keeps the creation-time fallback reachable.
            logger.debug(f"Failed to get modification time from {image_path}: {e}, trying creation time")
        else:
            verdict = _classify_file_date(mtime_date, today)
            if verdict == "ok":
                logger.info(f"Using file modification time {mtime_date} for {image_path}")
                return mtime_date
            if verdict == "recent":
                logger.debug(f"File modification time {mtime_date} is too recent (likely copy date) for {image_path}, trying creation time")
            else:
                logger.debug(f"File modification time {mtime_date} is invalid for {image_path}, trying creation time")

        # Fallback to creation time
        try:
            stat_info = os.stat(image_path)
            # st_birthtime is the real creation time where available; st_ctime
            # is the inode change time on Linux, used only as the closest substitute.
            if hasattr(stat_info, 'st_birthtime'):
                ctime = stat_info.st_birthtime
            else:
                ctime = stat_info.st_ctime
            ctime_date = datetime.fromtimestamp(ctime).date()
        except (OSError, ValueError, OverflowError, AttributeError) as e:
            logger.error(f"Failed to get creation time from {image_path}: {e}")
            return None

        verdict = _classify_file_date(ctime_date, today)
        if verdict == "ok":
            logger.info(f"Using file creation/change time {ctime_date} for {image_path}")
            return ctime_date
        if verdict == "recent":
            logger.warning(f"File creation time {ctime_date} is too recent (likely copy date) for {image_path}, cannot determine photo date")
        else:
            logger.warning(f"File creation time {ctime_date} is invalid for {image_path}")
    except Exception as e:
        # Best effort: never let a timestamp problem fail the import
        logger.error(f"Failed to get file timestamps from {image_path}: {e}")

    return None

View File

@ -17,6 +17,7 @@ pytest-cov>=4.1.0
# Core Dependencies
numpy>=1.21.0
pillow>=8.0.0
exifread>=3.0.0
click>=8.0.0
setuptools>=40.0.0