From 42101ea7e71e8fbfd15c6ac521e99286cb3d9efe Mon Sep 17 00:00:00 2001 From: tanyar09 Date: Fri, 30 Jan 2026 17:19:52 +0000 Subject: [PATCH] feat: Add exifread library for enhanced EXIF date extraction - Introduced the exifread library to improve reliability in extracting EXIF date from images. - Updated the extract_exif_date function to prioritize exifread for date extraction. - Enhanced logging for successful and failed date extractions. - Added validation for extracted dates to ensure they fall within a valid range. --- backend/services/photo_service.py | 252 +++++++++++++++++++++++++++--- requirements.txt | 1 + 2 files changed, 232 insertions(+), 21 deletions(-) diff --git a/backend/services/photo_service.py b/backend/services/photo_service.py index 70cebb4..6639563 100644 --- a/backend/services/photo_service.py +++ b/backend/services/photo_service.py @@ -58,9 +58,10 @@ def extract_exif_date(image_path: str) -> Optional[date]: """Extract date taken from photo EXIF data - returns Date (not DateTime) to match desktop schema. Tries multiple methods to extract EXIF date: - 1. PIL's getexif() (modern method) - uses .get() for tag access - 2. PIL's _getexif() (deprecated but sometimes more reliable) - 3. Access EXIF IFD directly if available + 1. exifread library (most reliable for reading EXIF) + 2. PIL's getexif() (modern method) - uses .get() for tag access + 3. PIL's _getexif() (deprecated but sometimes more reliable) + 4. Access EXIF IFD directly if available Returns: Date object or None if no valid EXIF date found @@ -68,6 +69,48 @@ def extract_exif_date(image_path: str) -> Optional[date]: import logging logger = logging.getLogger(__name__) + # Try exifread library first (most reliable) + try: + import exifread + with open(image_path, 'rb') as f: + tags = exifread.process_file(f, details=False) + + # Look for date tags in exifread format + # exifread uses tag names like 'EXIF DateTimeOriginal', 'Image DateTime', etc. + date_tag_names = [ + 'EXIF DateTimeOriginal', # When photo was taken (highest priority) + 'EXIF DateTimeDigitized', # When photo was digitized + 'Image DateTime', # File modification date + 'EXIF DateTime', # Alternative format + ] + + for tag_name in date_tag_names: + if tag_name in tags: + date_str = str(tags[tag_name]) + if date_str: + try: + # exifread returns dates in format "YYYY:MM:DD HH:MM:SS" + dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S") + extracted_date = dt.date() + if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1): + logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}") + return extracted_date + except ValueError: + # Try alternative format + try: + dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") + extracted_date = dt.date() + if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1): + logger.info(f"Successfully extracted date {extracted_date} from {tag_name} using exifread for {image_path}") + return extracted_date + except ValueError: + continue + except ImportError: + logger.debug("exifread library not available, falling back to PIL") + except Exception as e: + logger.debug(f"exifread failed for {image_path}: {e}, trying PIL") + + # Fallback to PIL methods try: with Image.open(image_path) as image: exifdata = None @@ -93,9 +136,18 @@ def extract_exif_date(image_path: str) -> Optional[date]: logger.debug(f"Deprecated _getexif() failed for {image_path}: {e}") if not exifdata: - logger.debug(f"No EXIF data found in {image_path}") + logger.warning(f"No EXIF data found in {image_path} - will fall back to file modification time") return None + # Debug: Log all available EXIF tags (only in debug mode to avoid spam) + if logger.isEnabledFor(logging.DEBUG): + try: + if hasattr(exifdata, 'items'): + all_tags = list(exifdata.items())[:20] # First 20 tags for debugging + logger.debug(f"Available EXIF tags in {image_path}: {all_tags}") + except Exception: + pass + # Look for date taken in EXIF tags # Priority: DateTimeOriginal (when photo was taken) > DateTimeDigitized > DateTime (file modification) date_tags = [ @@ -104,17 +156,96 @@ def extract_exif_date(image_path: str) -> Optional[date]: 306, # DateTime - file modification date (lowest priority) ] - # Try accessing tags - use .get() method for modern API, direct access for old API + # Also try to find any date-like tags by iterating through all tags + # This helps catch dates that might be in different tag IDs + all_date_strings = [] + try: + if hasattr(exifdata, 'items'): + for tag_id, value in exifdata.items(): + if value and isinstance(value, (str, bytes)): + value_str = value.decode('utf-8', errors='ignore') if isinstance(value, bytes) else str(value) + # Check if it looks like a date string (YYYY:MM:DD or YYYY-MM-DD format) + if len(value_str) >= 10 and ('-' in value_str[:10] or ':' in value_str[:10]): + try: + # Try to parse it as a date + if ':' in value_str[:10]: + test_dt = datetime.strptime(value_str[:19], "%Y:%m:%d %H:%M:%S") + else: + test_dt = datetime.strptime(value_str[:19], "%Y-%m-%d %H:%M:%S") + all_date_strings.append((tag_id, value_str, test_dt.date())) + except (ValueError, IndexError): + pass + except Exception as e: + logger.debug(f"Error iterating through all EXIF tags in {image_path}: {e}") + + # Try accessing tags - use multiple methods for compatibility for tag_id in date_tags: try: - # Use .get() method for modern Exif object, direct access for dict-like old API + # Try multiple access methods for compatibility + date_str = None + if is_modern_api: - date_str = exifdata.get(tag_id) + # Modern getexif() API - try multiple access methods + # The Exif object from getexif() supports dictionary-like access + try: + # Method 1: Try .get() method + if hasattr(exifdata, 'get'): + date_str = exifdata.get(tag_id) + else: + date_str = None + + # Method 2: If .get() returned None, try direct access + if not date_str: + try: + # Exif objects support __getitem__ for tag access + date_str = exifdata[tag_id] + except (KeyError, TypeError, AttributeError): + pass + + # Method 3: Try iterating through all tags + if not date_str: + try: + # Exif objects are iterable + for key, value in exifdata.items(): + if key == tag_id: + date_str = value + break + except (AttributeError, TypeError): + pass + + # Method 4: Try using ExifTags.TAGS to help identify tags + if not date_str: + try: + from PIL.ExifTags import TAGS + # Log what tags are available for debugging + if logger.isEnabledFor(logging.DEBUG): + available_tag_ids = list(exifdata.keys())[:10] + logger.debug(f"Available tag IDs in {image_path}: {available_tag_ids}") + for tid in available_tag_ids: + tag_name = TAGS.get(tid, f"Unknown({tid})") + logger.debug(f" Tag {tid} ({tag_name}): {exifdata.get(tid)}") + except (ImportError, AttributeError, TypeError): + pass + except Exception as e: + logger.debug(f"Error accessing tag {tag_id} with modern API: {e}") + date_str = None else: # Old _getexif() returns a dict-like object - date_str = exifdata.get(tag_id) if hasattr(exifdata, 'get') else (exifdata[tag_id] if tag_id in exifdata else None) + if hasattr(exifdata, 'get'): + date_str = exifdata.get(tag_id) + elif hasattr(exifdata, '__getitem__'): + try: + if tag_id in exifdata: + date_str = exifdata[tag_id] + except (KeyError, TypeError): + pass if date_str: + # Ensure date_str is a string, not bytes or other type + if isinstance(date_str, bytes): + date_str = date_str.decode('utf-8', errors='ignore') + elif not isinstance(date_str, str): + date_str = str(date_str) # Parse EXIF date format (YYYY:MM:DD HH:MM:SS) try: dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S") @@ -143,6 +274,23 @@ def extract_exif_date(image_path: str) -> Optional[date]: logger.debug(f"Error accessing tag {tag_id} in {image_path}: {e}") continue + # If we found date strings by iterating, try them (prioritize DateTimeOriginal-like dates) + if all_date_strings: + # Sort by tag ID (lower IDs like 306, 36867, 36868 are date tags) + # Priority: DateTimeOriginal (36867) > DateTimeDigitized (36868) > DateTime (306) > others + all_date_strings.sort(key=lambda x: ( + 0 if x[0] == 36867 else # DateTimeOriginal first + 1 if x[0] == 36868 else # DateTimeDigitized second + 2 if x[0] == 306 else # DateTime third + 3 # Other dates last + )) + + for tag_id, date_str, extracted_date in all_date_strings: + # Validate date + if extracted_date <= date.today() and extracted_date >= date(1900, 1, 1): + logger.info(f"Successfully extracted date {extracted_date} from tag {tag_id} (found by iteration) in {image_path}") + return extracted_date + # Try accessing EXIF IFD directly if available (for tags in EXIF IFD like DateTimeOriginal) try: if hasattr(exifdata, 'get_ifd'): @@ -152,7 +300,17 @@ def extract_exif_date(image_path: str) -> Optional[date]: logger.debug(f"Trying EXIF IFD for {image_path}") for tag_id in date_tags: try: - date_str = exif_ifd.get(tag_id) if hasattr(exif_ifd, 'get') else (exif_ifd[tag_id] if tag_id in exif_ifd else None) + # Try multiple access methods for IFD + date_str = None + if hasattr(exif_ifd, 'get'): + date_str = exif_ifd.get(tag_id) + elif hasattr(exif_ifd, '__getitem__'): + try: + if tag_id in exif_ifd: + date_str = exif_ifd[tag_id] + except (KeyError, TypeError): + pass + if date_str: try: dt = datetime.strptime(date_str, "%Y:%m:%d %H:%M:%S") @@ -293,34 +451,86 @@ def extract_video_date(video_path: str) -> Optional[date]: def extract_photo_date(image_path: str) -> Optional[date]: - """Extract date taken from photo with fallback to file modification time. + """Extract date taken from photo with fallback to file modification time, then creation time. Tries in order: 1. EXIF date tags (DateTimeOriginal, DateTimeDigitized, DateTime) - 2. File modification time (as fallback) + 2. File modification time (as fallback if EXIF fails) + 3. File creation time (as final fallback if modification time doesn't exist) Returns: Date object or None if no date can be determined """ + import logging + import stat + logger = logging.getLogger(__name__) + # First try EXIF date extraction date_taken = extract_exif_date(image_path) if date_taken: + logger.info(f"Successfully extracted EXIF date {date_taken} from {image_path}") return date_taken - # Fallback to file modification time + # EXIF extraction failed - try file modification time + logger.warning(f"EXIF date extraction failed for {image_path}, trying file modification time") + try: if os.path.exists(image_path): - mtime = os.path.getmtime(image_path) - mtime_date = datetime.fromtimestamp(mtime).date() - # Validate date before returning (reject future dates) - if mtime_date > date.today() or mtime_date < date(1900, 1, 1): - return None # Skip invalid dates - return mtime_date + # Try modification time first + try: + mtime = os.path.getmtime(image_path) + mtime_date = datetime.fromtimestamp(mtime).date() + today = date.today() + # Reject future dates and dates that are too recent (likely copy dates) + # If modification time is within the last 7 days, it's probably a copy date, not the original photo date + days_ago = (today - mtime_date).days + if mtime_date <= today and mtime_date >= date(1900, 1, 1): + if days_ago <= 7: + # Modification time is too recent - likely a copy date, skip it + logger.debug(f"File modification time {mtime_date} is too recent (likely copy date) for {image_path}, trying creation time") + else: + # Modification time is old enough to be a real photo date + logger.info(f"Using file modification time {mtime_date} for {image_path}") + return mtime_date + else: + logger.debug(f"File modification time {mtime_date} is invalid for {image_path}, trying creation time") + except (OSError, ValueError) as e: + logger.debug(f"Failed to get modification time from {image_path}: {e}, trying creation time") + + # Fallback to creation time (birthtime on some systems, ctime on others) + try: + # Try to get creation time (birthtime on macOS/BSD, ctime on Linux as fallback) + stat_info = os.stat(image_path) + + # On Linux, ctime is change time (not creation), but it's the best we have + # On macOS/BSD, st_birthtime exists + if hasattr(stat_info, 'st_birthtime'): + # macOS/BSD - use birthtime (actual creation time) + ctime = stat_info.st_birthtime + else: + # Linux - use ctime (change time, closest to creation we can get) + ctime = stat_info.st_ctime + + ctime_date = datetime.fromtimestamp(ctime).date() + today = date.today() + # Validate date before returning (reject future dates and recent copy dates) + days_ago = (today - ctime_date).days + if ctime_date <= today and ctime_date >= date(1900, 1, 1): + if days_ago <= 7: + # Creation time is too recent - likely a copy date, reject it + logger.warning(f"File creation time {ctime_date} is too recent (likely copy date) for {image_path}, cannot determine photo date") + return None + else: + # Creation time is old enough to be a real photo date + logger.info(f"Using file creation/change time {ctime_date} for {image_path}") + return ctime_date + else: + logger.warning(f"File creation time {ctime_date} is invalid for {image_path}") + except (OSError, ValueError, AttributeError) as e: + logger.error(f"Failed to get creation time from {image_path}: {e}") except Exception as e: # Log error for debugging (but don't fail the import) - import logging - logger = logging.getLogger(__name__) - logger.debug(f"Failed to get file modification time from {image_path}: {e}") + logger.error(f"Failed to get file timestamps from {image_path}: {e}") return None diff --git a/requirements.txt b/requirements.txt index ccda4ca..47473c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,7 @@ pytest-cov>=4.1.0 # Core Dependencies numpy>=1.21.0 pillow>=8.0.0 +exifread>=3.0.0 click>=8.0.0 setuptools>=40.0.0