improved duplicate check speed

This commit is contained in:
Andreas Preikschat 2014-03-21 21:04:53 +01:00
parent ff78a99fc1
commit ed29edec3a
2 changed files with 44 additions and 21 deletions

View File

@ -31,6 +31,7 @@ The duplicate song removal logic for OpenLP.
""" """
import logging import logging
import multiprocessing
import os import os
from PyQt4 import QtCore, QtGui from PyQt4 import QtCore, QtGui
@ -45,6 +46,17 @@ from openlp.plugins.songs.lib.songcompare import songs_probably_equal
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
class SongIterator(object):
    """
    Iterable that produces every unordered pair of the given songs exactly once.

    The pairs come out in the same order as a nested index loop would produce them: the first song paired with
    every later song, then the second song paired with every later song, and so on. Fewer than two songs yield
    no pairs at all.
    """
    def __init__(self, songs):
        """
        Remember the songs to pair up.

        :param songs: The collection of songs to build pairs from.
        """
        self.songs = songs

    def __iter__(self):
        """
        Return an iterator over ``(earlier_song, later_song)`` tuples.

        :return: An iterator yielding each pair of distinct positions in ``self.songs`` once.
        """
        # Function-scope import keeps this class self-contained without touching the module's import block.
        from itertools import combinations
        # combinations(..., 2) emits pairs in positional order, replacing the hand-written nested index loops.
        return combinations(self.songs, 2)
class DuplicateSongRemovalForm(OpenLPWizard): class DuplicateSongRemovalForm(OpenLPWizard):
""" """
This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs
@ -167,24 +179,32 @@ class DuplicateSongRemovalForm(OpenLPWizard):
max_progress_count = max_songs * (max_songs - 1) // 2 max_progress_count = max_songs * (max_songs - 1) // 2
self.duplicate_search_progress_bar.setMaximum(max_progress_count) self.duplicate_search_progress_bar.setMaximum(max_progress_count)
songs = self.plugin.manager.get_all_objects(Song) songs = self.plugin.manager.get_all_objects(Song)
for outer_song_counter in range(max_songs - 1): # Create a worker/process pool to check the songs.
for inner_song_counter in range(outer_song_counter + 1, max_songs): process_number = max(1, multiprocessing.cpu_count() - 1)
if songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]): pool = multiprocessing.Pool(process_number)
duplicate_added = self.add_duplicates_to_song_list( song_list = SongIterator(songs)
songs[outer_song_counter], songs[inner_song_counter]) #song_list = [(songs[outer_song_counter], songs[inner_song_counter]) for outer_song_counter in range(max_songs - 1) for inner_song_counter in range(outer_song_counter + 1, max_songs)]
if duplicate_added: result = pool.imap_unordered(songs_probably_equal, song_list, 30)
self.found_duplicates_edit.appendPlainText( # Do not accept any further tasks. Also this closes the processes if all tasks are done.
songs[outer_song_counter].title + " = " + songs[inner_song_counter].title) pool.close()
# While the processes are still working, start to look at the results.
for song_tuple in result:
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1) self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
# The call to process_events() will keep the GUI responsive. # The call to process_events() will keep the GUI responsive.
self.application.process_events() self.application.process_events()
if self.break_search: if self.break_search:
pool.terminate()
return return
self.review_total_count = len(self.duplicate_song_list) if song_tuple is None:
if self.review_total_count == 0: continue
self.notify_no_duplicates() song1, song2 = song_tuple
else: duplicate_added = self.add_duplicates_to_song_list(song1, song2)
if duplicate_added:
self.found_duplicates_edit.appendPlainText(song1.title + " = " + song2.title)
if self.duplicate_song_list:
self.button(QtGui.QWizard.NextButton).show() self.button(QtGui.QWizard.NextButton).show()
else:
self.notify_no_duplicates()
finally: finally:
self.application.set_normal_cursor() self.application.set_normal_cursor()
elif page_id == self.review_page_id: elif page_id == self.review_page_id:

View File

@ -52,13 +52,15 @@ MIN_BLOCK_SIZE = 70
MAX_TYPO_SIZE = 3 MAX_TYPO_SIZE = 3
def songs_probably_equal(song1, song2): def songs_probably_equal(song1, song2=None):
""" """
Calculate and return whether two songs are probably equal. Calculate and return whether two songs are probably equal.
:param song1: The first song to compare. :param song1: The first song to compare.
:param song2: The second song to compare. :param song2: The second song to compare.
""" """
if song2 is None:
song1, song2 = song1
if len(song1.search_lyrics) < len(song2.search_lyrics): if len(song1.search_lyrics) < len(song2.search_lyrics):
small = song1.search_lyrics small = song1.search_lyrics
large = song2.search_lyrics large = song2.search_lyrics
@ -75,8 +77,9 @@ def songs_probably_equal(song1, song2):
for element in diff_no_typos: for element in diff_no_typos:
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE: if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
length_of_equal_blocks += _op_length(element) length_of_equal_blocks += _op_length(element)
if length_of_equal_blocks >= MIN_BLOCK_SIZE: if length_of_equal_blocks >= MIN_BLOCK_SIZE:
return True return song1, song2
# Check 2: Similarity based on the relative length of the longest equal block. # Check 2: Similarity based on the relative length of the longest equal block.
# Calculate the length of the largest equal block of the diff set. # Calculate the length of the largest equal block of the diff set.
length_of_longest_equal_block = 0 length_of_longest_equal_block = 0
@ -84,9 +87,9 @@ def songs_probably_equal(song1, song2):
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block: if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
length_of_longest_equal_block = _op_length(element) length_of_longest_equal_block = _op_length(element)
if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3: if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3:
return True return song1, song2
# Both checks failed. We assume the songs are not equal. # Both checks failed. We assume the songs are not equal.
return False return None
def _op_length(opcode): def _op_length(opcode):