server/image-hash: do not depend on image-match
While I hold this library in great esteem for its excellent work on implementing the original paper, I have several problems with it:

- as of this commit, it (again) has bug fixes unreleased on pip
- its code is badly structured - forces OOP and then proceeds to @staticmethod everything
- bad class design, parameters are repeated in several places
- terrible contract of make_record() and generate_signature()
- ambiguous parameters: path vs. image path vs. image content
- doesn't adhere to PEP-8
- depends on cairo just to render SVG images, which almost no one uses this library with
parent 894cd29511
commit fd30675124

4 changed files with 267 additions and 45 deletions
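The module keeps the same small public surface as before: callers pass a post path plus raw image bytes, and searches return Lookalike results. A minimal usage sketch, assuming a running Elasticsearch configured under config['elasticsearch'], treating 'cat.jpg' and 'posts/1.jpg' as placeholders, and assuming Lookalike exposes the score/distance/path it is constructed with:

    from szurubooru.func import image_hash

    with open('cat.jpg', 'rb') as handle:
        content = handle.read()

    # Index the image under its post path, then look for visual duplicates.
    # Both calls go through _safety_blanket(), which maps Elasticsearch and
    # decoding failures to szurubooru's error types or a default value.
    image_hash.add_image('posts/1.jpg', content)
    for match in image_hash.search_by_image(content):
        print(match.path, match.score, match.distance)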
@@ -7,7 +7,6 @@ pytest-cov>=2.2.1
freezegun>=0.3.6
coloredlogs==5.0
pycodestyle>=2.0.0
image-match>=1.1.0
scipy>=0.18.1
elasticsearch>=5.0.0
elasticsearch-dsl>=5.0.0
@@ -1,11 +1,13 @@
import logging
from io import BytesIO
from datetime import datetime
import elasticsearch
import elasticsearch_dsl
import xml.etree
from image_match.elasticsearch_driver import SignatureES
import numpy as np
from skimage.color import rgb2gray
from PIL import Image
from szurubooru import config, errors


# pylint: disable=invalid-name
logger = logging.getLogger(__name__)
es = elasticsearch.Elasticsearch([{
@@ -14,11 +16,190 @@ es = elasticsearch.Elasticsearch([{
}])


def _get_session():
    return SignatureES(es, index=config.config['elasticsearch']['index'])
# Math based on paper from H. Chi Wong, Marshall Bern and David Goldberg
# Math code taken from https://github.com/ascribe/image-match
# (which is licensed under Apache 2 license)

LOWER_PERCENTILE = 5
UPPER_PERCENTILE = 95
IDENTICAL_TOLERANCE = 2 / 255.
DISTANCE_CUTOFF = 0.45
N_LEVELS = 2
N = 9
P = None
SAMPLE_WORDS = 16
MAX_WORDS = 63
ES_DOC_TYPE = 'image'
ES_MAX_RESULTS = 100
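The constants above fix the shape of everything below; the arithmetic they imply is worth spelling out (an editor's note, not part of the diff):

    # A 9x9 grid of sample points, each differenced against its 8 neighbours,
    # flattens to a signature of 9 * 9 * 8 = 648 int8 entries. The index then
    # stores 63 overlapping words of 16 consecutive entries each, every word
    # packed into a single integer with at most 3**16 distinct values.
    signature_length = 9 * 9 * 8            # 648
    word_stride = signature_length / 63     # ~10.3, so neighbouring words overlap
    distinct_word_values = 3 ** 16          # 43046721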

def _safe_blanket(default_param_factory):
def _preprocess_image(image_or_path):
    img = Image.open(BytesIO(image_or_path))
    img = img.convert('RGB')
    return rgb2gray(np.asarray(img, dtype=np.uint8))


def _crop_image(image, lower_percentile, upper_percentile):
    rw = np.cumsum(np.sum(np.abs(np.diff(image, axis=1)), axis=1))
    cw = np.cumsum(np.sum(np.abs(np.diff(image, axis=0)), axis=0))
    upper_column_limit = np.searchsorted(
        cw, np.percentile(cw, upper_percentile), side='left')
    lower_column_limit = np.searchsorted(
        cw, np.percentile(cw, lower_percentile), side='right')
    upper_row_limit = np.searchsorted(
        rw, np.percentile(rw, upper_percentile), side='left')
    lower_row_limit = np.searchsorted(
        rw, np.percentile(rw, lower_percentile), side='right')
    if lower_row_limit > upper_row_limit:
        lower_row_limit = int(lower_percentile / 100. * image.shape[0])
        upper_row_limit = int(upper_percentile / 100. * image.shape[0])
    if lower_column_limit > upper_column_limit:
        lower_column_limit = int(lower_percentile / 100. * image.shape[1])
        upper_column_limit = int(upper_percentile / 100. * image.shape[1])
    return [
        (lower_row_limit, upper_row_limit),
        (lower_column_limit, upper_column_limit)]
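The crop is driven by cumulative edge energy rather than raw pixel values: rows and columns whose differences contribute almost nothing fall outside the 5th to 95th percentile window and get trimmed. A toy illustration with plain NumPy (editor's sketch, not part of the diff):

    import numpy as np

    # Cumulative column weight for a toy image whose detail sits in the middle.
    cw = np.cumsum([0, 0, 5, 20, 40, 20, 5, 0, 0])
    lower = np.searchsorted(cw, np.percentile(cw, 5), side='right')
    upper = np.searchsorted(cw, np.percentile(cw, 95), side='left')
    print(lower, upper)   # 2 6, the flat borders are cropped away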

def _normalize_and_threshold(diff_array, identical_tolerance, n_levels):
    mask = np.abs(diff_array) < identical_tolerance
    diff_array[mask] = 0.
    if np.all(mask):
        return None
    positive_cutoffs = np.percentile(
        diff_array[diff_array > 0.], np.linspace(0, 100, n_levels+1))
    negative_cutoffs = np.percentile(
        diff_array[diff_array < 0.], np.linspace(100, 0, n_levels+1))
    for level, interval in enumerate(
            positive_cutoffs[i:i+2]
            for i in range(positive_cutoffs.shape[0] - 1)):
        diff_array[
            (diff_array >= interval[0]) & (diff_array <= interval[1])] = \
            level + 1
    for level, interval in enumerate(
            negative_cutoffs[i:i+2]
            for i in range(negative_cutoffs.shape[0] - 1)):
        diff_array[
            (diff_array <= interval[0]) & (diff_array >= interval[1])] = \
            -(level + 1)
    return None
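Put differently, near-zero differentials collapse to 0 and the remaining values are bucketed into plus or minus 1..n_levels by their own percentiles, so the signature records relative contrast rather than absolute brightness. A small sketch of the cutoff computation (editor's illustration, not part of the diff):

    import numpy as np

    diffs = np.array([0.001, 0.10, 0.90, -0.05, -0.60, 0.30])
    diffs[np.abs(diffs) < 2 / 255.] = 0.   # the identical_tolerance step
    pos = np.percentile(diffs[diffs > 0.], np.linspace(0, 100, 3))   # n_levels = 2
    neg = np.percentile(diffs[diffs < 0.], np.linspace(100, 0, 3))
    print(pos)   # [0.1 0.3 0.9], the bucket edges the first loop maps to 1 and 2
    print(neg)   # [-0.05 -0.325 -0.6], mirrored to -1 and -2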

def _compute_grid_points(image, n, window=None):
    if window is None:
        window = [(0, image.shape[0]), (0, image.shape[1])]
    x_coords = np.linspace(window[0][0], window[0][1], n + 2, dtype=int)[1:-1]
    y_coords = np.linspace(window[1][0], window[1][1], n + 2, dtype=int)[1:-1]
    return x_coords, y_coords
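For the default n = 9 this simply picks nine evenly spaced interior coordinates per axis; for a 100-pixel-wide crop (editor's sketch):

    import numpy as np

    print(np.linspace(0, 100, 9 + 2, dtype=int)[1:-1])
    # [10 20 30 40 50 60 70 80 90], the window borders themselves are skipped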

def _compute_mean_level(image, x_coords, y_coords, p):
    if p is None:
        p = max([2.0, int(0.5 + min(image.shape) / 20.)])
    avg_grey = np.zeros((x_coords.shape[0], y_coords.shape[0]))
    for i, x in enumerate(x_coords):
        lower_x_lim = int(max([x - p / 2, 0]))
        upper_x_lim = int(min([lower_x_lim + p, image.shape[0]]))
        for j, y in enumerate(y_coords):
            lower_y_lim = int(max([y - p / 2, 0]))
            upper_y_lim = int(min([lower_y_lim + p, image.shape[1]]))
            avg_grey[i, j] = np.mean(
                image[lower_x_lim:upper_x_lim, lower_y_lim:upper_y_lim])
    return avg_grey


def _compute_differentials(grey_level_matrix):
    flipped = np.fliplr(grey_level_matrix)
    right_neighbors = -np.concatenate((
        np.diff(grey_level_matrix),
        np.zeros(grey_level_matrix.shape[0])
        .reshape((grey_level_matrix.shape[0], 1))), axis=1)
    down_neighbors = -np.concatenate((
        np.diff(grey_level_matrix, axis=0),
        np.zeros(grey_level_matrix.shape[1])
        .reshape((1, grey_level_matrix.shape[1]))))
    left_neighbors = -np.concatenate(
        (right_neighbors[:, -1:], right_neighbors[:, :-1]), axis=1)
    up_neighbors = -np.concatenate((down_neighbors[-1:], down_neighbors[:-1]))
    diagonals = np.arange(
        -grey_level_matrix.shape[0] + 1, grey_level_matrix.shape[0])
    upper_left_neighbors = sum([
        np.diagflat(np.insert(np.diff(np.diag(grey_level_matrix, i)), 0, 0), i)
        for i in diagonals])
    upper_right_neighbors = sum([
        np.diagflat(np.insert(np.diff(np.diag(flipped, i)), 0, 0), i)
        for i in diagonals])
    lower_right_neighbors = -np.pad(
        upper_left_neighbors[1:, 1:], (0, 1), mode='constant')
    lower_left_neighbors = -np.pad(
        upper_right_neighbors[1:, 1:], (0, 1), mode='constant')
    return np.dstack(np.array([
        upper_left_neighbors,
        up_neighbors,
        np.fliplr(upper_right_neighbors),
        left_neighbors,
        right_neighbors,
        np.fliplr(lower_left_neighbors),
        down_neighbors,
        lower_right_neighbors]))


def _generate_signature(path_or_image):
    im_array = _preprocess_image(path_or_image)
    image_limits = _crop_image(im_array,
        lower_percentile=LOWER_PERCENTILE,
        upper_percentile=UPPER_PERCENTILE)
    x_coords, y_coords = _compute_grid_points(
        im_array, n=N, window=image_limits)
    avg_grey = _compute_mean_level(im_array, x_coords, y_coords, p=P)
    diff_matrix = _compute_differentials(avg_grey)
    _normalize_and_threshold(diff_matrix,
        identical_tolerance=IDENTICAL_TOLERANCE, n_levels=N_LEVELS)
    return np.ravel(diff_matrix).astype('int8')
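Taken together, _generate_signature() is the module's stand-in for image-match's generate_signature(): raw bytes in, a short int8 vector out. A quick check from inside this module (editor's sketch; 'some.jpg' is a placeholder and Pillow, scikit-image and NumPy must be installed):

    with open('some.jpg', 'rb') as handle:
        signature = _generate_signature(handle.read())
    print(signature.dtype, signature.shape)   # int8 (648,) with the defaults above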

def _get_words(array, k, n):
    word_positions = np.linspace(
        0, array.shape[0], n, endpoint=False).astype('int')
    assert k <= array.shape[0]
    assert word_positions.shape[0] <= array.shape[0]
    words = np.zeros((n, k)).astype('int8')
    for i, pos in enumerate(word_positions):
        if pos + k <= array.shape[0]:
            words[i] = array[pos:pos+k]
        else:
            temp = array[pos:].copy()
            temp.resize(k)
            words[i] = temp
    _max_contrast(words)
    words = _words_to_int(words)
    return words


def _words_to_int(word_array):
    width = word_array.shape[1]
    coding_vector = 3**np.arange(width)
    return np.dot(word_array + 1, coding_vector)


def _max_contrast(array):
    array[array > 0] = 1
    array[array < 0] = -1
    return None
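These words are what make the Elasticsearch lookup possible: _max_contrast() flattens every 16-entry slice of the signature to -1/0/1 and _words_to_int() packs it into one base-3 integer, which later lands in the simple_word_* fields. A toy 4-entry word (editor's sketch):

    import numpy as np

    word = np.array([[-1, 0, 1, 1]])
    coding_vector = 3 ** np.arange(4)        # [1, 3, 9, 27]
    print(np.dot(word + 1, coding_vector))   # [75]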

def _normalized_distance(_target_array, _vec, nan_value=1.0):
    target_array = _target_array.astype(int)
    vec = _vec.astype(int)
    topvec = np.linalg.norm(vec - target_array, axis=1)
    norm1 = np.linalg.norm(vec, axis=0)
    norm2 = np.linalg.norm(target_array, axis=1)
    finvec = topvec / (norm1 + norm2)
    finvec[np.isnan(finvec)] = nan_value
    return finvec
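The distance is the norm of the signature difference scaled by the sum of the individual norms, so an identical signature scores 0, an inverted one scores 1, and anything below DISTANCE_CUTOFF (0.45) is treated as a lookalike. Called from inside this module (editor's sketch):

    import numpy as np

    a = np.array([1, -1, 0, 2], dtype=np.int8)
    candidates = np.stack([a, -a])               # one identical, one inverted
    print(_normalized_distance(candidates, a))   # [0. 1.]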

def _safety_blanket(default_param_factory):
    def wrapper_outer(target_function):
        def wrapper_inner(*args, **kwargs):
            try:

@@ -28,14 +209,13 @@ def _safe_blanket(default_param_factory):
                # add_image()
                return default_param_factory()
            except elasticsearch.exceptions.ElasticsearchException as ex:
                logger.warning('Problem with elastic search: %s' % ex)
                logger.warning('Problem with elastic search: %s', ex)
                raise errors.ThirdPartyError(
                    'Error connecting to elastic search.')
            except xml.etree.ElementTree.ParseError as ex:
                # image-match issue #60
            except IOError:
                raise errors.ProcessingError('Not an image.')
            except Exception as ex:
                raise errors.ThirdPartyError('Unknown error (%s).' % ex)
                raise errors.ThirdPartyError('Unknown error (%s).', ex)
        return wrapper_inner
    return wrapper_outer

@@ -47,53 +227,96 @@ class Lookalike:
        self.path = path


@_safe_blanket(lambda: None)
@_safety_blanket(lambda: None)
def add_image(path, image_content):
    if not path or not image_content:
        return
    session = _get_session()
    session.add_image(path=path, img=image_content, bytestream=True)
    assert path
    assert image_content
    signature = _generate_signature(image_content)
    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)

    record = {
        'signature': signature.tolist(),
        'path': path,
        'timestamp': datetime.now(),
    }
    for i in range(MAX_WORDS):
        record['simple_word_' + str(i)] = words[i].tolist()

    es.index(
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE,
        body=record,
        refresh=True)


@_safe_blanket(lambda: None)
@_safety_blanket(lambda: None)
def delete_image(path):
    if not path:
        return
    session = _get_session()
    assert path
    es.delete_by_query(
        index=session.index,
        doc_type=session.doc_type,
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE,
        body={'query': {'term': {'path': path}}})


@_safe_blanket(lambda: [])
@_safety_blanket(lambda: [])
def search_by_image(image_content):
    signature = _generate_signature(image_content)
    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)

    res = es.search(
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE,
        body={
            'query':
            {
                'bool':
                {
                    'should':
                    [
                        {'term': {'simple_word_%d' % i: word.tolist()}}
                        for i, word in enumerate(words)
                    ]
                }
            },
            '_source': {'excludes': ['simple_word_*']}},
        size=ES_MAX_RESULTS,
        timeout='10s')['hits']['hits']

    if len(res) == 0:
        return []

    sigs = np.array([x['_source']['signature'] for x in res])
    dists = _normalized_distance(sigs, np.array(signature))

    ids = set()
    ret = []
    session = _get_session()
    for result in session.search_image(
            path=image_content,  # sic
            bytestream=True):
        ret.append(Lookalike(
            score=result['score'],
            distance=result['dist'],
            path=result['path']))
    for item, dist in zip(res, dists):
        id = item['_id']
        score = item['_score']
        path = item['_source']['path']
        if id in ids:
            continue
        ids.add(id)
        if dist < DISTANCE_CUTOFF:
            ret.append(Lookalike(score=score, distance=dist, path=path))
    return ret
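One detail that is easy to miss in search_by_image() above: the bool/should of term clauses only retrieves candidates that share at least one packed word with the query image, and the real ranking happens client-side through _normalized_distance() and the DISTANCE_CUTOFF filter. The query body it sends boils down to this shape (editor's sketch with placeholder word values):

    # One term clause per packed word; any shared word makes a post a candidate.
    words = [75, 6345156, 43046720]   # placeholders, real values come from _get_words()
    body = {
        'query': {'bool': {'should': [
            {'term': {'simple_word_%d' % i: word}} for i, word in enumerate(words)
        ]}},
        '_source': {'excludes': ['simple_word_*']},
    }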

@_safe_blanket(lambda: None)
@_safety_blanket(lambda: None)
def purge():
    session = _get_session()
    es.delete_by_query(
        index=session.index,
        doc_type=session.doc_type,
        body={'query': {'match_all': {}}})
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE,
        body={'query': {'match_all': {}}},
        refresh=True)


@_safe_blanket(lambda: set())
@_safety_blanket(lambda: set())
def get_all_paths():
    session = _get_session()
    search = (
        elasticsearch_dsl.Search(
            using=es, index=session.index, doc_type=session.doc_type)
            using=es,
            index=config.config['elasticsearch']['index'],
            doc_type=ES_DOC_TYPE)
        .source(['path']))
    return set(h.path for h in search.scan())

@@ -268,7 +268,8 @@ def _after_post_update(_mapper, _connection, post):

@sqlalchemy.events.event.listens_for(db.Post, 'before_delete')
def _before_post_delete(_mapper, _connection, post):
    image_hash.delete_image(post.post_id)
    if post.post_id:
        image_hash.delete_image(post.post_id)


def _sync_post_content(post):

@@ -279,7 +280,8 @@ def _sync_post_content(post):
        files.save(get_post_content_path(post), content)
        delattr(post, '__content')
        regenerate_thumb = True
        if post.type in (db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION):
        if post.post_id and post.type in (
                db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION):
            image_hash.delete_image(post.post_id)
            image_hash.add_image(post.post_id, content)

@@ -1,4 +1,3 @@
from time import sleep
from szurubooru.func import image_hash


@@ -7,11 +6,10 @@ def test_hashing(read_asset, config_injector):
    image_hash.purge()
    image_hash.add_image('test', read_asset('jpeg.jpg'))

    sleep(0.1)

    paths = image_hash.get_all_paths()
    results_exact = image_hash.search_by_image(read_asset('jpeg.jpg'))
    results_similar = image_hash.search_by_image(read_asset('jpeg-similar.jpg'))
    results_similar = image_hash.search_by_image(
        read_asset('jpeg-similar.jpg'))

    assert len(paths) == 1
    assert len(results_exact) == 1