From c2d3b51d7e8ffa8417dc698841599cb56c725775 Mon Sep 17 00:00:00 2001 From: Mateo LOSTANLEN Date: Mon, 14 Feb 2022 16:55:02 +0100 Subject: [PATCH 1/2] add video annotator --- pyrodataset/__init__.py | 2 + pyrodataset/splitter_utils.py | 192 ++++++++++++++++++++++++++++++ pyrodataset/version.py | 2 +- pyrodataset/video_splitter.py | 217 ++++++++++++++++++++++++++++++++++ requirements.txt | 4 +- setup.py | 6 +- test/requirements.txt | 1 + test/test_splitter_utils.py | 31 +++++ test/test_video_splitter.py | 118 ++++++++++++++++++ 9 files changed, 570 insertions(+), 3 deletions(-) create mode 100644 pyrodataset/splitter_utils.py create mode 100644 pyrodataset/video_splitter.py create mode 100644 test/test_splitter_utils.py create mode 100644 test/test_video_splitter.py diff --git a/pyrodataset/__init__.py b/pyrodataset/__init__.py index d359da9..101ee2d 100644 --- a/pyrodataset/__init__.py +++ b/pyrodataset/__init__.py @@ -4,3 +4,5 @@ from .wildfire import WildFireDataset from .check_annotations import * from .parse_annotations import * +from .splitter_utils import * +from .video_splitter import * diff --git a/pyrodataset/splitter_utils.py b/pyrodataset/splitter_utils.py new file mode 100644 index 0000000..7cf995b --- /dev/null +++ b/pyrodataset/splitter_utils.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import cv2 +import pytesseract +import re +import difflib +import math +from functools import partial + + +def findTextBoxes(image, min_height=5e-3, max_height=1, min_width=5e-3, max_width=1, relative=True): + """ + Return a generator with all parts of the image that contain text and are within min/max width and height + from https://www.danvk.org/2015/01/07/finding-blocks-of-text-in-an-image-using-python-opencv-and-numpy.html + + Args: + image: image + min_height: float (default: 5e-3), minimum image height for text box + max_height: float (default: 1), maximum image height for text box + min_width: float (default: 5e-3), minimum image width
for text box + max_width: float (default: 1), maximum image width for text box + relative: bool (default: True), compute min and max values relative to image size + + """ + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # grayscale + #_,thresh = cv2.threshold(gray,150,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) # threshold + _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY) # threshold + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + dilated = cv2.dilate(thresh, kernel, iterations=13) # dilate + contours, hierarchy = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) # get contours + + # for each contour found, yield its bounding rectangle if it is within the size limits + for contour in contours: + # get rectangle bounding contour + x, y, w, h = cv2.boundingRect(contour) + # discard areas that are too small or too large + H, W = (h, w) if not relative else 1. * np.array((h, w)) / image.shape[:2] + if (min_height <= H <= max_height) and (min_width <= W <= max_width): + yield x, y, w, h + #else: print (W,H, image.shape) + + +def findFirstTextBox(image, **kw): + """ + Return first text box in image + + Args: + image: image + kw: keyword arguments for findTextBoxes + """ + try: + x, y, w, h = next(findTextBoxes(image, **kw)) + return image[y:y + h + 1, x:x + w + 1] # rows (y) first, then columns (x) + except StopIteration: + return image + + +def findLargestTextBox(image, **kw): + """Return the largest text box (by area) in image, or the image itself if none is found + """ + try: + x, y, w, h = max(findTextBoxes(image, **kw), key=lambda x: x[2] * x[3]) + return image[y:y + h + 1, x:x + w + 1] # rows (y) first, then columns (x) + except ValueError: + return image + + +def findBlackBand(frame, threshold=10): + """ + Return image from the first row where the mean of the pixels is < threshold + + Args: + frame: image + threshold: float (default: 10) + """ + mean = frame.mean(axis=(1, 2)) + ymin = np.argmax(mean < threshold) + return frame[ymin:] + + +def bottomFraction(image, fraction=0.05): + """ + Return bottom fraction of image + + Args: + image: image + fraction: float (default: 0.05) + """ + return
image[int((1 - fraction) * image.shape[0]):] + + +def resizeImage(image, ratio=1, min_width=None, min_height=None): + """ + Resize image by ratio or to obtain at least a minimum width or height + + Args: + ratio: float, resizing ratio (default: 1) + min_width: int, minimum width in pixels or None (default) + min_height: int, minimum height in pixels or None (default) + """ + height, width = image.shape[:2] + ratio = max(ratio or 1, (min_width or width) / width, (min_height or height) / height) + new_size = math.ceil(ratio * width), math.ceil(ratio * height) + return cv2.resize(image, new_size) + + +def prepareOCR(frame, min_height=100, threshold=10, + grayscale=False, invert_colors=False, + cropFcn=lambda x: findBlackBand(bottomFraction(x))): + """ + Return a pre-processed image for applying OCR + + Args: + frame: image, 3D array (height, width, channels) + + min_height: int, minimum height of extraction image in pixels (default: 100). + The image is resized to reach the desired height + N.B.: The OCR was found to perform poorly in relatively small images + + grayscale: bool (default: False). Convert image to grayscale + + invert_colors: bool (default: False).
Invert image colors (use 255 - values) + + cropFcn: function to crop image (or None) + """ + if cropFcn is not None: + frame = cropFcn(frame) + + height, width = frame.shape[:2] + if min_height is not None and height < min_height: + # Upscale by min_height / height so that the height reaches min_height + ratio = min_height / height + new_size = math.ceil(ratio * width), math.ceil(ratio * height) + frame = cv2.resize(frame, new_size) + + if grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + if invert_colors: + frame = 255 - frame + + return frame + + +frame_to_string = partial(pytesseract.image_to_string, config='--psm 7') # single line + + +def extract_coordinates(caption, pattern=r'(X|x)+:(\S+) (¥|Y)+:(\S+) (Z|2|7)+:(\S+)', + indices=[2, 4, 6, 7], + possible_matches=[], + fixes={'\n': ''}): + """ + Extract and return the coordinates from the given caption. ValueError is raised if extraction fails + + Args: + caption: str, the caption to be processed + pattern: str, the pattern used to extract the coordinates + indices: list of ints, the indices to retain from the pattern matching + possible_matches: list of possible near-matches to be used as reference + fixes: dictionary with replacements, used to correct (known) problems with OCR + """ + fixed_caption = str(caption) + for fix in fixes.items(): + fixed_caption = fixed_caption.replace(*fix) + try: + coordinates = tuple(map(re.split(pattern, fixed_caption).__getitem__, indices)) + except IndexError: + raise ValueError('Problem extracting coordinates from caption', caption) + + # Remove datetime information from location name (last 2 words) + coordinates = coordinates[:-1] + (' '.join(coordinates[-1].split()[:-2]),) + + # Replace location name (last item) by close match if (x,y,z) matches + possibilities = set(i[-1] for i in possible_matches if i[:-1] == coordinates[:-1]) + matches = difflib.get_close_matches(coordinates[-1], possibilities, n=1, cutoff=0.9) + if matches: + coordinates = coordinates[:-1] + tuple(matches) +
return coordinates + + +def extract_timestamp(caption, pattern=r'(\d{4}(:|/)\d{2}(:|/)\d{2})', split=True): + """ + Return a string with date and time extracted from the given string + """ + match = re.split(pattern, caption) + try: + date = match[1] + time = match[-1].split()[0] + except IndexError: + raise ValueError('Problem extracting timestamp from caption', caption) + return ' '.join([date, time]) diff --git a/pyrodataset/version.py b/pyrodataset/version.py index eef2619..2483f18 100644 --- a/pyrodataset/version.py +++ b/pyrodataset/version.py @@ -1 +1 @@ -__version__ = '0.0.1.dev0+b04d341' +__version__ = '0.0.1.dev0+00f7d6b' diff --git a/pyrodataset/video_splitter.py b/pyrodataset/video_splitter.py new file mode 100644 index 0000000..b2ada2a --- /dev/null +++ b/pyrodataset/video_splitter.py @@ -0,0 +1,217 @@ +import os +import pickle +from bisect import bisect_left, bisect_right +from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip +import cv2 +from pyrodataset.splitter_utils import frame_to_string, prepareOCR, extract_coordinates + + +class VideoSplitter: + """ + To split a video into sequences of frames with the same camera settings (coordinates x,y,z, location name). + + The coordinates are extracted by applying OCR to the bottom of the frames. + Since OCR is very slow, in order to avoid analysing all frames the splitting is done using binary search + and assuming the same settings/coordinates only appear in one continuous sequence. + + Args: + fname: video file name + captions: dict or file containing captions + acceptCloseMatches: accept locations with similar names to try to bypass OCR problems + max_frames: int, default: 200. 
Maximum number of frames to analyse + img_preprocessing (optional): function used to prepare image for OCR + frame_to_string (optional): function used to extract caption from frame + extract_coordinates (optional): function used to extract coordinates from caption + """ + def __init__(self, fname, captions=None, + acceptCloseMatches=True, max_frames=200, + frame_to_string=frame_to_string, + img_preprocessing=prepareOCR, + extract_coordinates=extract_coordinates): + self.fname = fname + self.acceptCloseMatches = acceptCloseMatches + self.max_frames = max_frames + self.img_preprocessing = img_preprocessing + self.frame_to_string = frame_to_string + self.extract_coordinates = extract_coordinates + + self.video = cv2.VideoCapture(fname) + self.Nframes = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT)) + assert self.Nframes > 0, f'Invalid video file {fname}' + self.fps = self.video.get(cv2.CAP_PROP_FPS) + + self.captions = {} # (frame_index, caption) + self.coordinates = {} # (frame_index, camera position in x,y,z) N.B.: strings, not float + # (coordinates, first frame where coordinates appeared): + # represents a sorted list of frames, used for defining the start and end of sequences by bisection + self.seqID = {} + self.sequences = {} + if captions is not None: + self.loadCaptions(captions) + + def loadCaptions(self, captions): + """ + Load captions from the given dictionary, file or path and process them + in order to extract coordinates and seqID. 
+ In case of a path, look for <path>/<basename>_captions.pickle + """ + if not isinstance(captions, dict): + if os.path.isdir(captions): + basename, ext = os.path.splitext(os.path.basename(self.fname)) + captions = os.path.join(captions, f'{basename}_captions.pickle') + with open(captions, 'rb') as capFile: + captions = pickle.load(capFile) # NOTE(review): pickle.load can execute arbitrary code; only load trusted files + self.captions = captions + for frame_index in captions: + self.processFrame(frame_index) + + def loadFrame(self, frame_index): + """ + Load and return the frame with the given index or None if reading fails + """ + self.video.set(cv2.CAP_PROP_POS_FRAMES, frame_index) + success, frame = self.video.read() + return frame if success else None + + def processFrame(self, frame_index, ignore_caption=False): + """ + Extract and process the caption from the given frame, storing it in + self.captions if not present, the coordinates in self.coordinates, + and check if the frame belongs to a sequence previously identified + or define a new entry in self.seqID + + Args: + frame_index: the frame number + ignore_caption: bool (default: False).
Extract the caption even if it is + present in self.captions + """ + if ignore_caption or frame_index not in self.captions: + if len(self.captions) >= self.max_frames: + raise RuntimeError(f'Maximum number of frames ({self.max_frames}) analysed') + frame = self.loadFrame(frame_index) + preprocessed = self.img_preprocessing(frame) + caption = self.frame_to_string(preprocessed) + self.captions[frame_index] = caption + else: + caption = self.captions[frame_index] + try: + pm = self.seqID if self.acceptCloseMatches else [] + coordinates = self.extract_coordinates(caption, possible_matches=pm) + except ValueError: + coordinates = ('EXTRACTION FAILED', frame_index, caption) + + self.coordinates[frame_index] = coordinates + # If first time the coordinates appeared, add a new item to seqID + if coordinates not in self.seqID: + self.seqID[coordinates] = frame_index + + def getCoordinates(self, frame_index): + """ + Return the coordinates of the camera for the frame with the given index. + + """ + if frame_index not in self.coordinates: + self.processFrame(frame_index) + return self.coordinates[frame_index] + + def printCaptions(self): + "Print captions" + print('Frame \t Caption') + for i in sorted(self.captions.items()): + print(f'{i[0]} \t {i[1]}') + + def __getitem__(self, item): + """ + Return the first frame that revealed the sequence to which the given frame belongs. 
+ This method is called by bisect to find the boundaries of the sequence + """ + return self.seqID[self.getCoordinates(item)] + + def findSequences(self): + """ + Fill dictionary self.sequences with coordinates, (first frame, last frame) for each sequence + """ + if not self.coordinates: + # fill coordinates with first and last values if empty + self[0], self[self.Nframes - 1] + while True: + # Coordinates not yet analysed + missing = dict((frame, coord) for (frame, coord) in self.coordinates.items() + if coord not in self.sequences) + if not missing: + return + # Find the start and end of each sequence corresponding to each set of coordinates + for (frame, coord) in missing.items(): + self.sequences[coord] = bisect_left(self, self[frame]), bisect_right(self, self[frame]) - 1 + + def printSequences(self): + """ + Print frame range and coordinates of each sequence + """ + if not self.sequences: + return + print('Frames \t Coordinates') + for v, k in sorted((v, k) for k, v in self.sequences.items()): + print(f'{v} \t {k}') + + def writeSequences(self, outputdir, min_frames=10): + """ + Write each sequence to outputdir as <basename>_seq<fmin>_<fmax>.<ext> + where fmin and fmax are the number of first and last frame + + Args: + outputdir: str, output directory + min_frames: int, default: 10.
Minimum number of frames with same settings to consider a sequence + """ + self.findSequences() + if not os.path.isdir(outputdir): + os.makedirs(outputdir) + valid_sequences = filter(lambda x: x[1] - x[0] >= min_frames, self.sequences.values()) + for (fmin, fmax) in sorted(valid_sequences): + basename, ext = os.path.splitext(os.path.basename(self.fname)) + fname = os.path.join(outputdir, f'{basename}_seq{fmin}_{fmax}{ext}') + ffmpeg_extract_subclip(self.fname, fmin / self.fps, fmax / self.fps, fname) + + def writeInfo(self, outputdir): + """ + Write dictionaries in outputdir/<basename>_<dictName>.pickle where + dictName = captions, coordinates, seqID, sequences + + Args: + outputdir: str, output directory + """ + if not os.path.isdir(outputdir): + os.makedirs(outputdir) + basename, ext = os.path.splitext(os.path.basename(self.fname)) + dicts = {'captions': self.captions, 'coordinates': self.coordinates, + 'seqID': self.seqID, 'sequences': self.sequences} + for k, v in dicts.items(): + fname = os.path.join(outputdir, f'{basename}_{k}.pickle') + with open(fname, 'wb') as pickleFile: + pickle.dump(v, pickleFile) + + def __len__(self): + return self.Nframes + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description='Split videos in sequences') + parser.add_argument('filenames', help='Video filenames', nargs='+') + parser.add_argument('--outputdir', required=True, + help='Output directory for writing sequences and info') + parser.add_argument('--captions', default=None, + help='Pickle file or directory containing captions (optional)') + parser.add_argument('--no-print', help='Do not print sequences', action='store_true') + parser.add_argument('--min-frames', help='Minimum frames to write a sequence (default: 10)', + default=10, type=int) + parser.add_argument('--max-frames', help='Maximum frames to process (default: 200)', + default=200, type=int) + args = parser.parse_args() + + for fname in args.filenames: + vs = VideoSplitter(fname,
captions=args.captions, max_frames=args.max_frames) + vs.writeSequences(args.outputdir, min_frames=args.min_frames) + if not args.no_print: + vs.printSequences() + vs.writeInfo(args.outputdir) diff --git a/requirements.txt b/requirements.txt index 4220623..dd5a0b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ torch>=1.8.0 torchvision>=0.9.0 tqdm>=4.20.0 requests>=2.20.0 -ipywidgets>=7.5.1 \ No newline at end of file +ipywidgets>=7.5.1 +moviepy>=1.0.3 +pytesseract>=0.3.4 \ No newline at end of file diff --git a/setup.py b/setup.py index a7b296c..e42d333 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,8 @@ "tqdm>=4.20.0", "requests>=2.20.0", "ipywidgets>=7.5.1", + "moviepy>=1.0.3", + "pytesseract>=0.3.4", # Testing "PyYAML>=5.1.2", "youtube-dl>=2020.3.24", @@ -77,7 +79,9 @@ def deps_list(*pkgs): deps["torchvision"], deps["tqdm"], deps["requests"], - deps["ipywidgets"] + deps["ipywidgets"], + deps["moviepy"], + deps["pytesseract"] ] extras = {} diff --git a/test/requirements.txt b/test/requirements.txt index 4078350..8e69e3e 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -3,3 +3,4 @@ youtube-dl>=2020.3.24 pafy>=0.5.5 coverage>=4.5.4 flake8>=3.6.0 +imutils>=0.5.3 diff --git a/test/test_splitter_utils.py b/test/test_splitter_utils.py new file mode 100644 index 0000000..c09a62e --- /dev/null +++ b/test/test_splitter_utils.py @@ -0,0 +1,31 @@ +import unittest +from pyrodataset.splitter_utils import extract_coordinates, extract_timestamp + + +class UtilsTester(unittest.TestCase): + """ + Test caption manipulation + """ + def setUp(self): + import urllib.request + import imutils + url = 'https://gist.github.com/blenzi/1cf8d14fd01494f7d9c0e34714f35c29/raw' + self.ref = eval(urllib.request.urlopen(url).read()) # NOTE(review): eval of remote content + self.img = imutils.url_to_image(self.ref['url']) + + def test_extract_coordinates(self): + caption = self.ref['caption'] + coordinates = self.ref['coordinates'] + self.assertEqual(extract_coordinates(caption), coordinates) + + def
test_extract_timestamp(self): + caption = self.ref['caption'] + timestamp = self.ref['timestamp'] + self.assertEqual(extract_timestamp(caption), timestamp) + + def test_shape(self): + self.assertEqual(self.ref['shape'], self.img.shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_video_splitter.py b/test/test_video_splitter.py new file mode 100644 index 0000000..25a8d73 --- /dev/null +++ b/test/test_video_splitter.py @@ -0,0 +1,118 @@ +import unittest +import os +import pickle +from pyrodataset.video_splitter import VideoSplitter + + +def setupTester(cls): + """ + Prepare tester class for VideoSplitter + """ + import urllib.request + import yaml + # Test parameters + url = 'https://gist.githubusercontent.com/blenzi/82746e11119cb88a67603944869e29e2/raw' + cls.ref = eval(urllib.request.urlopen(url).read()) + + # Stream + if not os.path.exists(cls.ref['fname']): + import pafy + vid = pafy.new(cls.ref['url']) + stream = vid.getbest() + print(f'Downloading {stream.get_filesize()/1e6:.2f} MB') + stream.download(filepath=cls.ref['fname']) + cls.fname = cls.ref['fname'] + + # Ref captions + yamlFile = "https://gist.github.com/blenzi/02027e8973d79cd89bc601b119d2a190/raw" + with urllib.request.urlopen(yamlFile) as yF: + cls.captions = yaml.safe_load(yF) + + +class VideoTester(unittest.TestCase): + """ + Test VideoSplitter + """ + @classmethod + def setUpClass(cls): + "Setup only once for all tests" + setupTester(cls) + cls.splitter = VideoSplitter(cls.fname) + cls.testFindSequences = False # skip finding sequences (takes about 30s) + + def a_test_loadFrame(self): # call it a_ as they are executed in alphabetical order + frame = self.splitter.loadFrame(self.ref['extract']['frame']) + self.assertEqual(len(frame.shape), 3) + + def test_analyseFrame(self): + # TODO: compare caption and coordinates with expected values (modulo OCR problems) + frame_index = self.ref['extract']['frame'] + self.splitter.processFrame(frame_index) + 
self.assertIn(frame_index, self.splitter.captions) + self.assertIn(frame_index, self.splitter.coordinates) + + def test_findSequences(self): + "Test frame range in sequences (ignore exact coordinates)" + if not self.testFindSequences: + return + self.maxDiff = None + self.splitter.findSequences() + seqs = self.splitter.sequences + inv_seqs = dict(map(reversed, seqs.items())) # invert keys and values + self.assertEqual(inv_seqs.keys(), self.ref['sequences'].keys()) + + def test_writeSequences(self): + "Test writing movie sequences" + if not self.testFindSequences: + return + import tempfile + with tempfile.TemporaryDirectory() as tmpdirname: + self.splitter.writeSequences(tmpdirname, min_frames=0) + basename, ext = os.path.splitext(os.path.basename(self.splitter.fname)) + for fmin, fmax in self.splitter.sequences.values(): + fname = os.path.join(tmpdirname, f'{basename}_seq{fmin}_{fmax}{ext}') + self.assertTrue(os.path.exists(fname)) + + def test_writeInfo(self): + "Test writing dictionaries with captions, sequences, ..." 
+ import tempfile + basename, ext = os.path.splitext(os.path.basename(self.splitter.fname)) + with tempfile.TemporaryDirectory() as tmpdirname: + self.splitter.writeInfo(tmpdirname) + names = 'captions', 'coordinates', 'seqID', 'sequences' + dicts = [getattr(self.splitter, name) for name in names] + self.assertTrue(any(dicts)) + for name, d in zip(names, dicts): + fname = os.path.join(tmpdirname, f'{basename}_{name}.pickle') + with open(fname, 'rb') as pickleFile: + dSaved = pickle.load(pickleFile) + self.assertEqual(d, dSaved) + + +class VideoTesterWithCaptions(VideoTester): + """ + Test VideoSplitter with captions loaded externally + """ + @classmethod + def setUpClass(cls): + "Setup only once for all tests" + setupTester(cls) + cls.splitter = VideoSplitter(cls.fname, cls.captions) + cls.testFindSequences = True + + def test_loadCaptions(self): + "Test loadCaption from pickle file" + import tempfile + with tempfile.TemporaryDirectory() as tmpdirname: + basename, ext = os.path.splitext(os.path.basename(self.splitter.fname)) + fname = os.path.join(tmpdirname, f'{basename}_captions.pickle') + with open(fname, 'wb') as pickleFile: + pickle.dump(self.captions, pickleFile) + + # Load from fname + self.splitter.loadCaptions(fname) + self.assertEqual(self.captions, self.splitter.captions) + + # Load from directory name + self.splitter.loadCaptions(tmpdirname) + self.assertEqual(self.captions, self.splitter.captions) From 1c74d16577f83512f2ecf7977a9e72f522b00c72 Mon Sep 17 00:00:00 2001 From: Mateo LOSTANLEN Date: Mon, 14 Feb 2022 17:08:24 +0100 Subject: [PATCH 2/2] add header --- pyrodataset/splitter_utils.py | 5 ++++- pyrodataset/video_splitter.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pyrodataset/splitter_utils.py b/pyrodataset/splitter_utils.py index 7cf995b..fa74dc1 100644 --- a/pyrodataset/splitter_utils.py +++ b/pyrodataset/splitter_utils.py @@ -1,4 +1,7 @@ -# -*- coding: utf-8 -*- +# Copyright (C) 2019-2022, Pyronear. 
+ +# This program is licensed under the Apache License version 2. +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. import numpy as np import cv2 diff --git a/pyrodataset/video_splitter.py b/pyrodataset/video_splitter.py index b2ada2a..4dd51bf 100644 --- a/pyrodataset/video_splitter.py +++ b/pyrodataset/video_splitter.py @@ -1,3 +1,8 @@ +# Copyright (C) 2019-2022, Pyronear. + +# This program is licensed under the Apache License version 2. +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. + import os import pickle from bisect import bisect_left, bisect_right