Merge pull request 'feat: Add exifread library for enhanced EXIF date extraction' (#16) from fix/exif-date-extraction-improvements into dev
All checks were successful
CI / skip-ci-check (pull_request) Successful in 10s
CI / lint-and-type-check (pull_request) Successful in 1m10s
CI / python-lint (pull_request) Successful in 35s
CI / test-backend (pull_request) Successful in 2m35s
CI / build (pull_request) Successful in 3m48s
CI / secret-scanning (pull_request) Successful in 18s
CI / dependency-scan (pull_request) Successful in 15s
CI / sast-scan (pull_request) Successful in 1m29s
CI / workflow-summary (pull_request) Successful in 9s
All checks were successful
CI / skip-ci-check (pull_request) Successful in 10s
CI / lint-and-type-check (pull_request) Successful in 1m10s
CI / python-lint (pull_request) Successful in 35s
CI / test-backend (pull_request) Successful in 2m35s
CI / build (pull_request) Successful in 3m48s
CI / secret-scanning (pull_request) Successful in 18s
CI / dependency-scan (pull_request) Successful in 15s
CI / sast-scan (pull_request) Successful in 1m29s
CI / workflow-summary (pull_request) Successful in 9s
Reviewed-on: #16
This commit is contained in:
commit
7cfee99350
@ -58,9 +58,10 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
"""Extract date taken from photo EXIF data - returns Date (not DateTime) to match desktop schema.
|
||||
|
||||
Tries multiple methods to extract EXIF date:
|
||||
1. PIL's getexif() (modern method) - uses .get() for tag access
|
||||
2. PIL's _getexif() (deprecated but sometimes more reliable)
|
||||
3. Access EXIF IFD directly if available
|
||||
1. exifread library (most reliable for reading EXIF)
|
||||
2. PIL's getexif() (modern method) - uses .get() for tag access
|
||||
3. PIL's _getexif() (deprecated but sometimes more reliable)
|
||||
4. Access EXIF IFD directly if available
|
||||
|
||||
Returns:
|
||||
Date object or None if no valid EXIF date found
|
||||
@ -68,6 +69,48 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try exifread library first (most reliable)
|
||||
try:
|
||||
import exifread
|
||||
with open(image_path, 'rb') as f:
|
||||
tags = exifread.process_file(f, details=False)
|
||||
|
||||
# Look for date tags in exifread format
|
||||
# exifread uses tag names like 'EXIF DateTimeOriginal', 'Image DateTime', etc.
|
||||
date_tag_names = [
|
||||
'EXIF DateTimeOriginal', # When photo was taken (highest priority)
|
||||
'EXIF DateTimeDigitized', # When photo was digitized
|
||||
'Image DateTime', # File modification date
|
||||
'EXIF DateTime', # Alternative format
|
||||
]
|
||||
|
||||
for tag_name in date_tag_names:
|
||||
if tag_name in tags:
|
||||
date_str = str(tags[tag_name])
|
||||
if date_str:
|
||||
try:
|
||||
# exifread returns dates in format "YYYY:MM:DD HH:MM:SS"
|
||||
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
|
||||
extracted_date = dt.date()
|
||||
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
|
||||
logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}")
|
||||
return extracted_date
|
||||
except ValueError:
|
||||
# Try alternative format
|
||||
try:
|
||||
dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
|
||||
extracted_date = dt.date()
|
||||
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
|
||||
logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}")
|
||||
return extracted_date
|
||||
except ValueError:
|
||||
continue
|
||||
except ImportError:
|
||||
logger.debug("exifread library not available, falling back to PIL")
|
||||
except Exception as e:
|
||||
logger.debug(f"exifread failed for {image_path}: {e}, trying PIL")
|
||||
|
||||
# Fallback to PIL methods
|
||||
try:
|
||||
with Image.open(image_path) as image:
|
||||
exifdata = None
|
||||
@ -93,9 +136,18 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
logger.debug(f"Deprecated _getexif() failed for {image_path}: {e}")
|
||||
|
||||
if not exifdata:
|
||||
logger.debug(f"No EXIF data found in {image_path}")
|
||||
logger.warning(f"No EXIF data found in {image_path} - will fall back to file modification time")
|
||||
return None
|
||||
|
||||
# Debug: Log all available EXIF tags (only in debug mode to avoid spam)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
try:
|
||||
if hasattr(exifdata, 'items'):
|
||||
all_tags = list(exifdata.items())[:20] # First 20 tags for debugging
|
||||
logger.debug(f"Available EXIF tags in {image_path}: {all_tags}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Look for date taken in EXIF tags
|
||||
# Priority: DateTimeOriginal (when photo was taken) > DateTimeDigitized > DateTime (file modification)
|
||||
date_tags = [
|
||||
@ -104,17 +156,96 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
306, # DateTime - file modification date (lowest priority)
|
||||
]
|
||||
|
||||
# Try accessing tags - use .get() method for modern API, direct access for old API
|
||||
# Also try to find any date-like tags by iterating through all tags
|
||||
# This helps catch dates that might be in different tag IDs
|
||||
all_date_strings = []
|
||||
try:
|
||||
if hasattr(exifdata, 'items'):
|
||||
for tag_id, value in exifdata.items():
|
||||
if value and isinstance(value, (str, bytes)):
|
||||
value_str = value.decode('utf-8', errors='ignore') if isinstance(value, bytes) else str(value)
|
||||
# Check if it looks like a date string (YYYY:MM:DD or YYYY-MM-DD format)
|
||||
if len(value_str) >= 10 and ('-' in value_str[:10] or ':' in value_str[:10]):
|
||||
try:
|
||||
# Try to parse it as a date
|
||||
if ':' in value_str[:10]:
|
||||
test_dt = datetime.strptime(value_str[:19], "%Y:%m:%d %H:%M:%S")
|
||||
else:
|
||||
test_dt = datetime.strptime(value_str[:19], "%Y-%m-%d %H:%M:%S")
|
||||
all_date_strings.append((tag_id, value_str, test_dt.date()))
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.debug(f"Error iterating through all EXIF tags in {image_path}: {e}")
|
||||
|
||||
# Try accessing tags - use multiple methods for compatibility
|
||||
for tag_id in date_tags:
|
||||
try:
|
||||
# Use .get() method for modern Exif object, direct access for dict-like old API
|
||||
# Try multiple access methods for compatibility
|
||||
date_str = None
|
||||
|
||||
if is_modern_api:
|
||||
date_str = exifdata.get(tag_id)
|
||||
# Modern getexif() API - try multiple access methods
|
||||
# The Exif object from getexif() supports dictionary-like access
|
||||
try:
|
||||
# Method 1: Try .get() method
|
||||
if hasattr(exifdata, 'get'):
|
||||
date_str = exifdata.get(tag_id)
|
||||
else:
|
||||
date_str = None
|
||||
|
||||
# Method 2: If .get() returned None, try direct access
|
||||
if not date_str:
|
||||
try:
|
||||
# Exif objects support __getitem__ for tag access
|
||||
date_str = exifdata[tag_id]
|
||||
except (KeyError, TypeError, AttributeError):
|
||||
pass
|
||||
|
||||
# Method 3: Try iterating through all tags
|
||||
if not date_str:
|
||||
try:
|
||||
# Exif objects are iterable
|
||||
for key, value in exifdata.items():
|
||||
if key == tag_id:
|
||||
date_str = value
|
||||
break
|
||||
except (AttributeError, TypeError):
|
||||
pass
|
||||
|
||||
# Method 4: Still nothing found - log the available tags (named via ExifTags.TAGS) for debugging; this step does not set date_str
|
||||
if not date_str:
|
||||
try:
|
||||
from PIL.ExifTags import TAGS
|
||||
# Log what tags are available for debugging
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
available_tag_ids = list(exifdata.keys())[:10]
|
||||
logger.debug(f"Available tag IDs in {image_path}: {available_tag_ids}")
|
||||
for tid in available_tag_ids:
|
||||
tag_name = TAGS.get(tid, f"Unknown({tid})")
|
||||
logger.debug(f" Tag {tid} ({tag_name}): {exifdata.get(tid)}")
|
||||
except (ImportError, AttributeError, TypeError):
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.debug(f"Error accessing tag {tag_id} with modern API: {e}")
|
||||
date_str = None
|
||||
else:
|
||||
# Old _getexif() returns a dict-like object
|
||||
date_str = exifdata.get(tag_id) if hasattr(exifdata, 'get') else (exifdata[tag_id] if tag_id in exifdata else None)
|
||||
if hasattr(exifdata, 'get'):
|
||||
date_str = exifdata.get(tag_id)
|
||||
elif hasattr(exifdata, '__getitem__'):
|
||||
try:
|
||||
if tag_id in exifdata:
|
||||
date_str = exifdata[tag_id]
|
||||
except (KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if date_str:
|
||||
# Ensure date_str is a string, not bytes or other type
|
||||
if isinstance(date_str, bytes):
|
||||
date_str = date_str.decode('utf-8', errors='ignore')
|
||||
elif not isinstance(date_str, str):
|
||||
date_str = str(date_str)
|
||||
# Parse EXIF date format (YYYY:MM:DD HH:MM:SS)
|
||||
try:
|
||||
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
|
||||
@ -143,6 +274,23 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
logger.debug(f"Error accessing tag {tag_id} in {image_path}: {e}")
|
||||
continue
|
||||
|
||||
# If we found date strings by iterating, try them (prioritize DateTimeOriginal-like dates)
|
||||
if all_date_strings:
|
||||
# Sort by priority rank rather than by numeric tag ID (36867, 36868 and 306 are the known date tags)
|
||||
# Priority: DateTimeOriginal (36867) > DateTimeDigitized (36868) > DateTime (306) > others
|
||||
all_date_strings.sort(key=lambda x: (
|
||||
0 if x[0] == 36867 else # DateTimeOriginal first
|
||||
1 if x[0] == 36868 else # DateTimeDigitized second
|
||||
2 if x[0] == 306 else # DateTime third
|
||||
3 # Other dates last
|
||||
))
|
||||
|
||||
for tag_id, date_str, extracted_date in all_date_strings:
|
||||
# Validate date
|
||||
if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1):
|
||||
logger.info(f"Successfully extracted date {extracted_date} from tag {tag_id} (found by iteration) in {image_path}")
|
||||
return extracted_date
|
||||
|
||||
# Try accessing EXIF IFD directly if available (for tags in EXIF IFD like DateTimeOriginal)
|
||||
try:
|
||||
if hasattr(exifdata, 'get_ifd'):
|
||||
@ -152,7 +300,17 @@ def extract_exif_date(image_path: str) -> Optional[date]:
|
||||
logger.debug(f"Trying EXIF IFD for {image_path}")
|
||||
for tag_id in date_tags:
|
||||
try:
|
||||
date_str = exif_ifd.get(tag_id) if hasattr(exif_ifd, 'get') else (exif_ifd[tag_id] if tag_id in exif_ifd else None)
|
||||
# Try multiple access methods for IFD
|
||||
date_str = None
|
||||
if hasattr(exif_ifd, 'get'):
|
||||
date_str = exif_ifd.get(tag_id)
|
||||
elif hasattr(exif_ifd, '__getitem__'):
|
||||
try:
|
||||
if tag_id in exif_ifd:
|
||||
date_str = exif_ifd[tag_id]
|
||||
except (KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if date_str:
|
||||
try:
|
||||
dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S")
|
||||
@ -293,34 +451,86 @@ def extract_video_date(video_path: str) -> Optional[date]:
|
||||
|
||||
|
||||
def extract_photo_date(image_path: str) -> Optional[date]:
|
||||
"""Extract date taken from photo with fallback to file modification time.
|
||||
"""Extract date taken from photo with fallback to file modification time, then creation time.
|
||||
|
||||
Tries in order:
|
||||
1. EXIF date tags (DateTimeOriginal, DateTimeDigitized, DateTime)
|
||||
2. File modification time (as fallback)
|
||||
2. File modification time (as fallback if EXIF fails)
|
||||
3. File creation time (as final fallback if modification time is unavailable, invalid, or within the last 7 days)
|
||||
|
||||
Returns:
|
||||
Date object or None if no date can be determined
|
||||
"""
|
||||
import logging
|
||||
import stat
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# First try EXIF date extraction
|
||||
date_taken = extract_exif_date(image_path)
|
||||
if date_taken:
|
||||
logger.info(f"Successfully extracted EXIF date {date_taken} from {image_path}")
|
||||
return date_taken
|
||||
|
||||
# Fallback to file modification time
|
||||
# EXIF extraction failed - try file modification time
|
||||
logger.warning(f"EXIF date extraction failed for {image_path}, trying file modification time")
|
||||
|
||||
try:
|
||||
if os.path.exists(image_path):
|
||||
mtime = os.path.getmtime(image_path)
|
||||
mtime_date = datetime.fromtimestamp(mtime).date()
|
||||
# Validate date before returning (reject future dates)
|
||||
if mtime_date > date.today() or mtime_date < date(1900, 1, 1):
|
||||
return None # Skip invalid dates
|
||||
return mtime_date
|
||||
# Try modification time first
|
||||
try:
|
||||
mtime = os.path.getmtime(image_path)
|
||||
mtime_date = datetime.fromtimestamp(mtime).date()
|
||||
today = date.today()
|
||||
# Reject future dates and dates that are too recent (likely copy dates)
|
||||
# If modification time is within the last 7 days, it's probably a copy date, not the original photo date
|
||||
days_ago = (today - mtime_date).days
|
||||
if mtime_date <= today and mtime_date >= date(1900, 1, 1):
|
||||
if days_ago <= 7:
|
||||
# Modification time is too recent - likely a copy date, skip it
|
||||
logger.debug(f"File modification time {mtime_date} is too recent (likely copy date) for {image_path}, trying creation time")
|
||||
else:
|
||||
# Modification time is old enough to be a real photo date
|
||||
logger.info(f"Using file modification time {mtime_date} for {image_path}")
|
||||
return mtime_date
|
||||
else:
|
||||
logger.debug(f"File modification time {mtime_date} is invalid for {image_path}, trying creation time")
|
||||
except (OSError, ValueError) as e:
|
||||
logger.debug(f"Failed to get modification time from {image_path}: {e}, trying creation time")
|
||||
|
||||
# Fallback to creation time (birthtime on some systems, ctime on others)
|
||||
try:
|
||||
# Try to get creation time (birthtime on macOS/BSD, ctime on Linux as fallback)
|
||||
stat_info = os.stat(image_path)
|
||||
|
||||
# On Linux, ctime is change time (not creation), but it's the best we have
|
||||
# On macOS/BSD, st_birthtime exists
|
||||
if hasattr(stat_info, 'st_birthtime'):
|
||||
# macOS/BSD - use birthtime (actual creation time)
|
||||
ctime = stat_info.st_birthtime
|
||||
else:
|
||||
# Linux - use ctime (change time, closest to creation we can get)
|
||||
ctime = stat_info.st_ctime
|
||||
|
||||
ctime_date = datetime.fromtimestamp(ctime).date()
|
||||
today = date.today()
|
||||
# Validate date before returning (reject future dates and recent copy dates)
|
||||
days_ago = (today - ctime_date).days
|
||||
if ctime_date <= today and ctime_date >= date(1900, 1, 1):
|
||||
if days_ago <= 7:
|
||||
# Creation time is too recent - likely a copy date, reject it
|
||||
logger.warning(f"File creation time {ctime_date} is too recent (likely copy date) for {image_path}, cannot determine photo date")
|
||||
return None
|
||||
else:
|
||||
# Creation time is old enough to be a real photo date
|
||||
logger.info(f"Using file creation/change time {ctime_date} for {image_path}")
|
||||
return ctime_date
|
||||
else:
|
||||
logger.warning(f"File creation time {ctime_date} is invalid for {image_path}")
|
||||
except (OSError, ValueError, AttributeError) as e:
|
||||
logger.error(f"Failed to get creation time from {image_path}: {e}")
|
||||
except Exception as e:
|
||||
# Log error for debugging (but don't fail the import)
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.debug(f"Failed to get file modification time from {image_path}: {e}")
|
||||
logger.error(f"Failed to get file timestamps from {image_path}: {e}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@ -17,6 +17,7 @@ pytest-cov>=4.1.0
|
||||
# Core Dependencies
|
||||
numpy>=1.21.0
|
||||
pillow>=8.0.0
|
||||
exifread>=3.0.0
|
||||
click>=8.0.0
|
||||
setuptools>=40.0.0
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user