Improved duplicate check speed

This commit is contained in:
Andreas Preikschat 2014-03-21 21:04:53 +01:00
parent ff78a99fc1
commit ed29edec3a
2 changed files with 44 additions and 21 deletions

View File

@ -31,6 +31,7 @@ The duplicate song removal logic for OpenLP.
"""
import logging
import multiprocessing
import os
from PyQt4 import QtCore, QtGui
@ -45,6 +46,17 @@ from openlp.plugins.songs.lib.songcompare import songs_probably_equal
log = logging.getLogger(__name__)
class SongIterator(object):
    """
    Iterable over all unordered pairs of songs from a collection of songs.

    Every pair ``(songs[i], songs[j])`` with ``i < j`` is produced exactly once. The pairs are generated lazily while
    iterating, so the complete list of pairs never has to be held in memory.
    """
    def __init__(self, songs):
        """
        Remember the songs the pairs are built from.

        :param songs: The songs to combine into pairs.
        """
        self.songs = songs

    def __iter__(self):
        """
        Return a fresh iterator over the song pairs.

        The pairs come in the same order nested index loops would produce: the first song paired with every later
        song, then the second song paired with every later song, and so on. Fewer than two songs yield no pairs.

        :return: An iterator of ``(song, later_song)`` tuples.
        """
        # Local import keeps this class self-contained; no module level import is required for it.
        import itertools
        # combinations() replaces the hand-written index loops and does not rely on len() or indexing.
        return itertools.combinations(self.songs, 2)
class DuplicateSongRemovalForm(OpenLPWizard):
"""
This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs
@ -167,24 +179,32 @@ class DuplicateSongRemovalForm(OpenLPWizard):
max_progress_count = max_songs * (max_songs - 1) // 2
self.duplicate_search_progress_bar.setMaximum(max_progress_count)
songs = self.plugin.manager.get_all_objects(Song)
for outer_song_counter in range(max_songs - 1):
for inner_song_counter in range(outer_song_counter + 1, max_songs):
if songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]):
duplicate_added = self.add_duplicates_to_song_list(
songs[outer_song_counter], songs[inner_song_counter])
if duplicate_added:
self.found_duplicates_edit.appendPlainText(
songs[outer_song_counter].title + " = " + songs[inner_song_counter].title)
# Create a worker/process pool to check the songs.
process_number = max(1, multiprocessing.cpu_count() - 1)
pool = multiprocessing.Pool(process_number)
song_list = SongIterator(songs)
#song_list = [(songs[outer_song_counter], songs[inner_song_counter]) for outer_song_counter in range(max_songs - 1) for inner_song_counter in range(outer_song_counter + 1, max_songs)]
result = pool.imap_unordered(songs_probably_equal, song_list, 30)
# Do not accept any further tasks. Also this closes the processes if all tasks are done.
pool.close()
# While the processes are still working, start to look at the results.
for song_tuple in result:
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
# The call to process_events() will keep the GUI responsive.
self.application.process_events()
if self.break_search:
pool.terminate()
return
self.review_total_count = len(self.duplicate_song_list)
if self.review_total_count == 0:
self.notify_no_duplicates()
else:
if song_tuple is None:
continue
song1, song2 = song_tuple
duplicate_added = self.add_duplicates_to_song_list(song1, song2)
if duplicate_added:
self.found_duplicates_edit.appendPlainText(song1.title + " = " + song2.title)
if self.duplicate_song_list:
self.button(QtGui.QWizard.NextButton).show()
else:
self.notify_no_duplicates()
finally:
self.application.set_normal_cursor()
elif page_id == self.review_page_id:

View File

@ -52,13 +52,15 @@ MIN_BLOCK_SIZE = 70
MAX_TYPO_SIZE = 3
def songs_probably_equal(song1, song2):
def songs_probably_equal(song1, song2=None):
"""
Calculate and return whether two songs are probably equal.
:param song1: The first song to compare.
:param song2: The second song to compare.
"""
if song2 is None:
song1, song2 = song1
if len(song1.search_lyrics) < len(song2.search_lyrics):
small = song1.search_lyrics
large = song2.search_lyrics
@ -75,8 +77,9 @@ def songs_probably_equal(song1, song2):
for element in diff_no_typos:
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
length_of_equal_blocks += _op_length(element)
if length_of_equal_blocks >= MIN_BLOCK_SIZE:
return True
return song1, song2
# Check 2: Similarity based on the relative length of the longest equal block.
# Calculate the length of the largest equal block of the diff set.
length_of_longest_equal_block = 0
@ -84,9 +87,9 @@ def songs_probably_equal(song1, song2):
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
length_of_longest_equal_block = _op_length(element)
if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3:
return True
return song1, song2
# Both checks failed. We assume the songs are not equal.
return False
return None
def _op_length(opcode):