import os
import sys
import lib.io
import lib.geo
from lib.exif import EXIF, verify_exif
from collections import OrderedDict
import datetime

'''
Sequence class for organizing/cleaning up photos in a folder
    - split to sequences based on time intervals
    - split to sequences based on gps distances
    - remove duplicate images (e.g. waiting for red light, in traffic etc) @simonmikkelsen
'''

MAXIMUM_SEQUENCE_LENGTH = 1000


class Sequence(object):

    def __init__(self, filepath, skip_folders=[], skip_subfolders=False, check_exif=True):
        self.filepath = filepath
        self._skip_folders = skip_folders
        self._skip_subfolders = skip_subfolders
        self.file_list = self.get_file_list(filepath, check_exif)
        self.num_images = len(self.file_list)

    def _is_skip(self, filepath):
        '''
        Skip photos in specified folders
            - filepath/duplicates: stores potential duplicate photos
              detected by method 'remove_duplicates'
            - filepath/success: stores photos that have been successfully uploaded
        '''
        _is_skip = False
        for folder in self._skip_folders:
            if folder in filepath:
                _is_skip = True
        if self._skip_subfolders and filepath != self.filepath:
            _is_skip = True
        return _is_skip

    def _read_capture_time(self, filename):
        '''
        Use EXIF class to parse capture time from EXIF.
        '''
        exif = EXIF(filename)
        return exif.extract_capture_time()

    def _read_lat_lon(self, filename):
        '''
        Use EXIF class to parse latitude and longitude from EXIF.
        '''
        exif = EXIF(filename)
        lon, lat = exif.extract_lon_lat()
        return lat, lon

    def _read_direction(self, filename):
        '''
        Use EXIF class to parse compass direction from EXIF.
        '''
        exif = EXIF(filename)
        direction = exif.extract_direction()
        return direction

    def get_file_list(self, filepath, check_exif=True):
        '''
        Get the list of JPEGs in the folder (nested folders)
        '''
        if filepath.lower().endswith(".jpg"):
            # single file
            file_list = [filepath]
        else:
            file_list = []
            # walk the requested path (not self.filepath, so the method also
            # works when called with a different folder)
            for root, sub_folders, files in os.walk(filepath):
                if not self._is_skip(root):
                    image_files = [os.path.join(root, filename) for filename in files
                                   if filename.lower().endswith(".jpg")]
                    if check_exif:
                        image_files = [f for f in image_files if verify_exif(f)]
                    file_list += image_files
        return file_list

    def sort_file_list(self, file_list):
        '''
        Read capture times and sort files in time order.
        '''
        if len(file_list) == 0:
            return [], []
        capture_times = [self._read_capture_time(filepath) for filepath in file_list]
        # Python 2: zip() returns a list, so it can be sorted in place
        sorted_times_files = zip(capture_times, file_list)
        sorted_times_files.sort()
        return zip(*sorted_times_files)

    def move_groups(self, groups, sub_path=''):
        '''
        Move the files in the groups to new folders.
        '''
        for i, group in enumerate(groups):
            new_dir = os.path.join(self.filepath, sub_path, str(i))
            lib.io.mkdir_p(new_dir)
            for filepath in group:
                os.rename(filepath, os.path.join(new_dir, os.path.basename(filepath)))
            print("Moved {0} photos to {1}".format(len(group), new_dir))

    def set_skip_folders(self, folders):
        '''
        Set folders to skip when iterating through the path
        '''
        self._skip_folders = folders

    def set_file_list(self, file_list):
        '''
        Set file list for the sequence
        '''
        self.file_list = file_list

    def split(self, cutoff_distance=500., cutoff_time=None, max_sequence_length=MAXIMUM_SEQUENCE_LENGTH, move_files=True, verbose=False, skip_cutoff=False):
        '''
        Split photos into sequences in case of large distance gap or large time interval
        @params cutoff_distance: maximum distance gap in meters
        @params cutoff_time: maximum time interval in seconds (if None, use 1.5 x median time interval in the sequence)
        '''

        file_list = self.file_list
        groups = []

        if len(file_list) >= 1:
            # sort based on EXIF capture time
            capture_times, file_list = self.sort_file_list(file_list)

            # diff in capture time
            capture_deltas = [t2 - t1 for t1, t2 in zip(capture_times, capture_times[1:])]

            # read gps for ordered files
            latlons = [self._read_lat_lon(filepath) for filepath in file_list]

            # distance between consecutive images
            distances = [lib.geo.gps_distance(ll1, ll2) for ll1, ll2 in zip(latlons, latlons[1:])]

            # if cutoff time is given use that, else assume cutoff is 1.5x median time delta
            if cutoff_time is None:
                if verbose:
                    print "Cut-off time is None"
                # guard against a single-photo list, where there are no deltas
                median = sorted(capture_deltas)[len(capture_deltas) // 2] if capture_deltas else 0
                if type(median) is not int:
                    median = median.total_seconds()
                cutoff_time = 1.5 * median

            # extract groups by cutting using cutoff time
            group = [file_list[0]]
            cut = 0
            for i, filepath in enumerate(file_list[1:]):
                cut_time = capture_deltas[i].total_seconds() > cutoff_time
                cut_distance = distances[i] > cutoff_distance
                cut_sequence_length = len(group) > max_sequence_length
                if cut_time or cut_distance or cut_sequence_length:
                    cut += 1
                    # delta too big, save current group, start new
                    groups.append(group)
                    group = [filepath]
                    if verbose:
                        if cut_distance:
                            print 'Cut {}: Delta in distance {} meters is bigger than cutoff_distance {} meters at {}'.format(cut, distances[i], cutoff_distance, file_list[i + 1])
                        elif cut_time:
                            print 'Cut {}: Delta in time {} seconds is bigger than cutoff_time {} seconds at {}'.format(cut, capture_deltas[i].total_seconds(), cutoff_time, file_list[i + 1])
                        elif cut_sequence_length:
                            print 'Cut {}: Maximum sequence length {} reached at {}'.format(cut, max_sequence_length, file_list[i + 1])
                else:
                    group.append(filepath)

            groups.append(group)

        # move groups to subfolders
        if move_files:
            self.move_groups(groups)

        print("Done splitting photos in {} into {} sequences".format(self.filepath, len(groups)))
        return groups

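    # A minimal usage sketch for split() (hypothetical folder path; assumes the
    # photos carry EXIF capture times and GPS coordinates):
    #
    #   s = Sequence('path/to/photos')
    #   groups = s.split(cutoff_distance=500., cutoff_time=None, move_files=False)
    #
    # With cutoff_time=None, a sequence shot at one photo per second with a
    # single 60-second gap has a median delta of 1 second, so the cutoff
    # becomes 1.5 seconds and the gap starts a new group.
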
    def interpolate_direction(self, offset=0):
        '''
        Interpolate bearing of photos in a sequence with an offset
        @author: mprins
        '''

        bearings = {}
        file_list = self.file_list
        num_file = len(file_list)

        if num_file > 1:
            # sort based on EXIF capture time
            capture_times, file_list = self.sort_file_list(file_list)

            # read gps for ordered files
            latlons = [self._read_lat_lon(filepath) for filepath in file_list]

            if len(file_list) > 1:
                # bearing between consecutive images
                bearings = [lib.geo.compute_bearing(ll1[0], ll1[1], ll2[0], ll2[1])
                            for ll1, ll2 in zip(latlons, latlons[1:])]
                # repeat the last bearing so every photo gets one
                bearings.append(bearings[-1])
                bearings = {file_list[i]: lib.geo.offset_bearing(b, offset) for i, b in enumerate(bearings)}
        elif num_file == 1:
            # if there is only one file in the list, just write the direction 0 and offset
            bearings = {file_list[0]: lib.geo.offset_bearing(0.0, offset)}

        return bearings

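    # Sketch of interpolate_direction() output (hypothetical filenames, and
    # assuming lib.geo.compute_bearing follows the usual great-circle bearing
    # convention): two photos at (0.0, 0.0) and (0.0, 0.001) head due east, so
    # both get bearing 90.0 and, with offset=0:
    #
    #   s.interpolate_direction()
    #   # -> {'a.jpg': 90.0, 'b.jpg': 90.0}
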
    def interpolate_timestamp(self):
        '''
        Interpolate time stamps in case of identical timestamps within a sequence
        '''
        timestamps = []
        file_list = self.file_list
        num_file = len(file_list)

        capture_times, file_list = self.sort_file_list(file_list)

        if num_file < 2:
            return capture_times, file_list

        # trace identical timestamps (always assume capture_times is sorted)
        time_dict = OrderedDict()
        for i, t in enumerate(capture_times):
            if t not in time_dict:
                time_dict[t] = {
                    "count": 0,
                    "pointer": 0
                }

                interval = 0
                if i != 0:
                    interval = (t - capture_times[i - 1]).total_seconds()
                    time_dict[capture_times[i - 1]]["interval"] = interval

            time_dict[t]["count"] += 1

        if len(time_dict) >= 2:
            # set time interval as the last available time interval
            time_dict[time_dict.keys()[-1]]["interval"] = time_dict[time_dict.keys()[-2]]["interval"]
        else:
            # set time interval assuming capture interval is 1 second
            time_dict[time_dict.keys()[0]]["interval"] = time_dict[time_dict.keys()[0]]["count"] * 1.

        # interpolate timestamps
        for f, t in zip(file_list, capture_times):
            d = time_dict[t]
            s = datetime.timedelta(seconds=d["pointer"] * d["interval"] / float(d["count"]))
            updated_time = t + s
            time_dict[t]["pointer"] += 1
            timestamps.append(updated_time)

        return timestamps, file_list

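    # Worked example for interpolate_timestamp() (hypothetical capture times):
    # three photos all stamped 10:00:00 followed by one at 10:00:03 give the
    # 10:00:00 entry count=3 and interval=3 seconds, so the duplicates are
    # spread to 10:00:00, 10:00:01 and 10:00:02; 10:00:03 stays unchanged.
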
    def remove_duplicates(self, min_distance=1e-5, min_angle=5):
        '''
        Detect duplicate photos in a folder
        @source: a less general version of @simonmikkelsen's duplicate remover
        '''
        file_list = self.file_list

        # ordered list by time
        capture_times, file_list = self.sort_file_list(file_list)

        # read gps for ordered files
        latlons = [self._read_lat_lon(filepath) for filepath in file_list]

        # read bearing for ordered files
        bearings = [self._read_direction(filepath) for filepath in file_list]

        # interpolated bearings
        interpolated_bearings = [lib.geo.compute_bearing(ll1[0], ll1[1], ll2[0], ll2[1])
                                 for ll1, ll2 in zip(latlons, latlons[1:])]
        interpolated_bearings.append(bearings[-1])

        # use interpolated bearings if bearing not available in EXIF
        for i, b in enumerate(bearings):
            bearings[i] = b if b is not None else interpolated_bearings[i]

        is_duplicate = False

        prev_unique = file_list[0]
        prev_latlon = latlons[0]
        prev_bearing = bearings[0]
        groups = []
        group = []
        for i, filename in enumerate(file_list[1:]):
            k = i + 1
            distance = lib.geo.gps_distance(latlons[k], prev_latlon)
            if bearings[k] is not None and prev_bearing is not None:
                bearing_diff = lib.geo.diff_bearing(bearings[k], prev_bearing)
            else:
                # do not use bearing difference if no bearings are available
                bearing_diff = 360
            if distance < min_distance and bearing_diff < min_angle:
                is_duplicate = True
            else:
                prev_latlon = latlons[k]
                prev_bearing = bearings[k]

            if is_duplicate:
                group.append(filename)
            else:
                if group:
                    groups.append(group)
                group = []

            is_duplicate = False
        groups.append(group)

        # move to filepath/duplicates/group_id (TODO: uploader should skip the duplicate folder)
        self.move_groups(groups, 'duplicates')
        print("Done removing duplicate photos in {} into {} groups".format(self.filepath, len(groups)))

        return groups

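
# A minimal, hedged usage sketch (assumes a folder of JPEGs with EXIF GPS and
# capture times; 'path/to/photos' is a placeholder):
if __name__ == '__main__':
    s = Sequence('path/to/photos', skip_folders=['duplicates', 'success'])
    print("Found {} images".format(s.num_images))
    # split into sequences without moving any files around
    groups = s.split(cutoff_distance=500., cutoff_time=None, move_files=False)
    # interpolate bearings from the GPS track, e.g. for a side-mounted camera
    bearings = s.interpolate_direction(offset=90)
    # remove_duplicates() would move near-identical consecutive photos into
    # 'duplicates/' subfolders, so it is left commented out here:
    # s.remove_duplicates(min_distance=1e-5, min_angle=5)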