'''
Sequence class for organizing/cleaning up photos in a folder
    - split to sequences based on time intervals
    - split to sequences based on gps distances
    - remove duplicate images (e.g. waiting for red light, in traffic etc)
@simonmikkelsen
'''

import os
import sys
import datetime
from collections import OrderedDict

import lib.io
import lib.geo
from lib.exif import EXIF, verify_exif

# Hard cap on the number of photos allowed in one sequence.
MAXIMUM_SEQUENCE_LENGTH = 1000


class Sequence(object):
    '''
    Collection of JPEGs under a folder, with helpers to split them into
    sequences, interpolate bearings/timestamps and detect duplicates.
    '''

    def __init__(self, filepath, skip_folders=None, skip_subfolders=False, check_exif=True):
        '''
        @param filepath: folder to scan (or a single ".jpg" file)
        @param skip_folders: folder-name fragments to ignore while walking
        @param skip_subfolders: if True, only use photos directly in filepath
        @param check_exif: if True, drop files whose EXIF cannot be verified
        '''
        self.filepath = filepath
        # None sentinel instead of a mutable default list, which would be
        # shared across all instances.
        self._skip_folders = [] if skip_folders is None else skip_folders
        self._skip_subfolders = skip_subfolders
        self.file_list = self.get_file_list(filepath, check_exif)
        self.num_images = len(self.file_list)

    def _is_skip(self, filepath):
        '''
        Skip photos in specified folders
            - filepath/duplicates: stores potential duplicate photos detected
              by method 'remove_duplicates'
            - filepath/success: stores photos that have been successfully processed
        '''
        if any(folder in filepath for folder in self._skip_folders):
            return True
        if self._skip_subfolders and filepath != self.filepath:
            return True
        return False

    def _read_capture_time(self, filename):
        '''Use EXIF class to parse capture time from EXIF.'''
        exif = EXIF(filename)
        return exif.extract_capture_time()

    def _read_lat_lon(self, filename):
        '''Use EXIF class to parse latitude and longitude from EXIF; returns (lat, lon).'''
        exif = EXIF(filename)
        lon, lat = exif.extract_lon_lat()
        return lat, lon

    def _read_direction(self, filename):
        '''Use EXIF class to parse compass direction from EXIF (may be None).'''
        exif = EXIF(filename)
        return exif.extract_direction()

    def get_file_list(self, filepath, check_exif=True):
        '''
        Get the list of JPEGs in the folder (nested folders included).
        A path ending in ".jpg" is treated as a single file.
        '''
        if filepath.lower().endswith(".jpg"):
            # single file
            return [filepath]
        file_list = []
        # NOTE(review): walks self.filepath, not the filepath argument, as
        # the original did; the two coincide when called from __init__.
        for root, sub_folders, files in os.walk(self.filepath):
            if self._is_skip(root):
                continue
            image_files = [os.path.join(root, filename)
                           for filename in files
                           if filename.lower().endswith(".jpg")]
            if check_exif:
                image_files = [f for f in image_files if verify_exif(f)]
            file_list += image_files
        return file_list

    def sort_file_list(self, file_list):
        '''
        Read capture times and sort files in time order.
        Returns (sorted_capture_times, sorted_file_list).
        '''
        if len(file_list) == 0:
            return [], []
        capture_times = [self._read_capture_time(f) for f in file_list]
        # sorted() instead of list.sort(): zip() is an iterator on Python 3
        # and has no .sort(); ties in capture time fall back to path order.
        sorted_times_files = sorted(zip(capture_times, file_list))
        times, files = zip(*sorted_times_files)
        return list(times), list(files)

    def move_groups(self, groups, sub_path=''):
        '''Move the files in the groups to new numbered sub-folders.'''
        for i, group in enumerate(groups):
            new_dir = os.path.join(self.filepath, sub_path, str(i))
            lib.io.mkdir_p(new_dir)
            for filepath in group:
                os.rename(filepath, os.path.join(new_dir, os.path.basename(filepath)))
            print("Moved {0} photos to {1}".format(len(group), new_dir))

    def set_skip_folders(self, folders):
        '''Set folders to skip when iterating through the path.'''
        self._skip_folders = folders

    def set_file_list(self, file_list):
        '''Set file list for the sequence.'''
        self.file_list = file_list

    def split(self, cutoff_distance=500., cutoff_time=None,
              max_sequence_length=MAXIMUM_SEQUENCE_LENGTH,
              move_files=True, verbose=False, skip_cutoff=False):
        '''
        Split photos into sequences in case of large distance gap
        or large time interval.

        @params cutoff_distance: maximum distance gap in meters
        @params cutoff_time: maximum time interval in seconds
            (if None, use 1.5 x median time interval in the sequence)
        '''
        file_list = self.file_list
        groups = []

        if len(file_list) >= 1:
            # sort based on EXIF capture time
            capture_times, file_list = self.sort_file_list(file_list)

            # diff in capture time between consecutive images
            capture_deltas = [t2 - t1 for t1, t2 in zip(capture_times, capture_times[1:])]

            # read gps for ordered files
            latlons = [self._read_lat_lon(f) for f in file_list]

            # distance between consecutive images
            distances = [lib.geo.gps_distance(ll1, ll2)
                         for ll1, ll2 in zip(latlons, latlons[1:])]

            # if cutoff time is given use that, else assume cutoff is
            # 1.5x median time delta
            if cutoff_time is None:
                if verbose:
                    print("Cut-off time is None")
                if capture_deltas:
                    median = sorted(capture_deltas)[len(capture_deltas) // 2]
                    if not isinstance(median, int):
                        median = median.total_seconds()
                    cutoff_time = 1.5 * median
                else:
                    # single photo: no deltas exist and the cut loop below
                    # never runs, so any value works
                    cutoff_time = 0.

            # extract groups by cutting using cutoff time / distance / length
            group = [file_list[0]]
            cut = 0
            for i, filepath in enumerate(file_list[1:]):
                cut_time = capture_deltas[i].total_seconds() > cutoff_time
                cut_distance = distances[i] > cutoff_distance
                cut_sequence_length = len(group) > max_sequence_length
                if cut_time or cut_distance or cut_sequence_length:
                    cut += 1
                    # delta too big, save current group, start new
                    groups.append(group)
                    group = [filepath]
                    if verbose:
                        if cut_distance:
                            print('Cut {}: Delta in distance {} meters is bigger than cutoff_distance {} meters at {}'.format(
                                cut, distances[i], cutoff_distance, file_list[i + 1]))
                        elif cut_time:
                            print('Cut {}: Delta in time {} seconds is bigger than cutoff_time {} seconds at {}'.format(
                                cut, capture_deltas[i].total_seconds(), cutoff_time, file_list[i + 1]))
                        elif cut_sequence_length:
                            print('Cut {}: Maximum sequence length {} reached at {}'.format(
                                cut, max_sequence_length, file_list[i + 1]))
                else:
                    group.append(filepath)
            groups.append(group)

            # move groups to subfolders
            if move_files:
                self.move_groups(groups)
            print("Done split photos in {} into {} sequences".format(
                self.filepath, len(groups)))
        return groups

    def interpolate_direction(self, offset=0):
        '''
        Interpolate bearing of photos in a sequence with an offset.
        Returns a dict mapping filepath -> bearing.
        @author: mprins
        '''
        bearings = {}
        file_list = self.file_list
        num_file = len(file_list)

        if num_file > 1:
            # sort based on EXIF capture time
            capture_times, file_list = self.sort_file_list(file_list)

            # read gps for ordered files
            latlons = [self._read_lat_lon(f) for f in file_list]

            # bearing between consecutive images; the last photo reuses the
            # bearing of the final pair
            pair_bearings = [lib.geo.compute_bearing(ll1[0], ll1[1], ll2[0], ll2[1])
                             for ll1, ll2 in zip(latlons, latlons[1:])]
            pair_bearings.append(pair_bearings[-1])
            bearings = {file_list[i]: lib.geo.offset_bearing(b, offset)
                        for i, b in enumerate(pair_bearings)}
        elif num_file == 1:
            # if there is only one file in the list, just write the
            # direction 0 and offset
            bearings = {file_list[0]: lib.geo.offset_bearing(0.0, offset)}
        return bearings

    def interpolate_timestamp(self):
        '''
        Interpolate time stamps in case of identical timestamps within
        a sequence. Returns (timestamps, file_list) in capture-time order.
        '''
        timestamps = []
        file_list = self.file_list
        num_file = len(file_list)

        capture_times, file_list = self.sort_file_list(file_list)

        if num_file < 2:
            return capture_times, file_list

        # trace identical timestamps (always assume capture_times is sorted)
        time_dict = OrderedDict()
        for i, t in enumerate(capture_times):
            if t not in time_dict:
                time_dict[t] = {
                    "count": 0,
                    "pointer": 0
                }
                interval = 0
                if i != 0:
                    interval = (t - capture_times[i - 1]).total_seconds()
                    time_dict[capture_times[i - 1]]["interval"] = interval
            time_dict[t]["count"] += 1

        # list(time_dict) instead of .keys()[...]: dict views are not
        # indexable on Python 3
        keys = list(time_dict)
        if len(time_dict) >= 2:
            # set time interval as the last available time interval
            time_dict[keys[-1]]["interval"] = time_dict[keys[-2]]["interval"]
        else:
            # set time interval assuming capture interval is 1 second
            time_dict[keys[0]]["interval"] = time_dict[keys[0]]["count"] * 1.

        # interpolate timestamps: spread photos sharing a timestamp evenly
        # across that timestamp's interval
        for f, t in zip(file_list, capture_times):
            d = time_dict[t]
            s = datetime.timedelta(seconds=d["pointer"] * d["interval"] / float(d["count"]))
            updated_time = t + s
            time_dict[t]["pointer"] += 1
            timestamps.append(updated_time)

        return timestamps, file_list

    def remove_duplicates(self, min_distance=1e-5, min_angle=5):
        '''
        Detect duplicate photos in a folder and move them to
        filepath/duplicates/<group_id>.
        @source: a less general version of @simonmikkelsen's duplicate remover
        '''
        file_list = self.file_list

        # ordered list by time
        capture_times, file_list = self.sort_file_list(file_list)
        if not file_list:
            return []

        # read gps for ordered files
        latlons = [self._read_lat_lon(f) for f in file_list]

        # read bearing for ordered files
        bearings = [self._read_direction(f) for f in file_list]

        # interpolated bearings as fallback when EXIF has none
        interpolated_bearings = [lib.geo.compute_bearing(ll1[0], ll1[1], ll2[0], ll2[1])
                                 for ll1, ll2 in zip(latlons, latlons[1:])]
        interpolated_bearings.append(bearings[-1])

        # use interpolated bearings if bearing not available in EXIF
        for i, b in enumerate(bearings):
            bearings[i] = b if b is not None else interpolated_bearings[i]

        prev_latlon = latlons[0]
        prev_bearing = bearings[0]
        groups = []
        group = []
        for i, filename in enumerate(file_list[1:]):
            k = i + 1
            distance = lib.geo.gps_distance(latlons[k], prev_latlon)
            if bearings[k] is not None and prev_bearing is not None:
                bearing_diff = lib.geo.diff_bearing(bearings[k], prev_bearing)
            else:
                # do not use bearing difference if no bearings are available
                bearing_diff = 360

            # the flag is recomputed for every photo; in the original it was
            # only ever reset on the non-duplicate path, so one duplicate
            # made every following photo a "duplicate"
            is_duplicate = distance < min_distance and bearing_diff < min_angle
            if not is_duplicate:
                prev_latlon = latlons[k]
                prev_bearing = bearings[k]

            if is_duplicate:
                group.append(filename)
            else:
                if group:
                    groups.append(group)
                group = []
        if group:
            groups.append(group)

        # move to filepath/duplicates/group_id
        # (TODO: uploader should skip the duplicate folder)
        self.move_groups(groups, 'duplicates')
        print("Done remove duplicate photos in {} into {} groups".format(
            self.filepath, len(groups)))
        return groups