forked from openlp/openlp
improved duplicate check speed
This commit is contained in:
parent
ff78a99fc1
commit
ed29edec3a
@ -31,6 +31,7 @@ The duplicate song removal logic for OpenLP.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
from PyQt4 import QtCore, QtGui
|
||||
@ -45,6 +46,17 @@ from openlp.plugins.songs.lib.songcompare import songs_probably_equal
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SongIterator(object):
    """
    Iterable over every unordered pair of songs in a sequence.

    Each pair ``(songs[i], songs[j])`` with ``i < j`` is produced exactly once, ordered by ``i`` and then by ``j``.
    The pairs are generated lazily, so the full list of ``n * (n - 1) / 2`` pairs is never held in memory.
    """
    def __init__(self, songs):
        """
        Remember the songs to pair up.

        :param songs: The songs to build the pairs from.
        """
        self.songs = songs

    def __iter__(self):
        """
        Return a fresh iterator over all song pairs.

        An empty or single-element collection produces no pairs.
        """
        # Imported locally so this class does not depend on the module's import block.
        import itertools
        # combinations() emits tuples in the same lexicographic index order as a nested "i < j" index loop would.
        return itertools.combinations(self.songs, 2)
|
||||
|
||||
|
||||
|
||||
class DuplicateSongRemovalForm(OpenLPWizard):
|
||||
"""
|
||||
This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs
|
||||
@ -167,24 +179,32 @@ class DuplicateSongRemovalForm(OpenLPWizard):
|
||||
max_progress_count = max_songs * (max_songs - 1) // 2
|
||||
self.duplicate_search_progress_bar.setMaximum(max_progress_count)
|
||||
songs = self.plugin.manager.get_all_objects(Song)
|
||||
for outer_song_counter in range(max_songs - 1):
|
||||
for inner_song_counter in range(outer_song_counter + 1, max_songs):
|
||||
if songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]):
|
||||
duplicate_added = self.add_duplicates_to_song_list(
|
||||
songs[outer_song_counter], songs[inner_song_counter])
|
||||
if duplicate_added:
|
||||
self.found_duplicates_edit.appendPlainText(
|
||||
songs[outer_song_counter].title + " = " + songs[inner_song_counter].title)
|
||||
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
|
||||
# The call to process_events() will keep the GUI responsive.
|
||||
self.application.process_events()
|
||||
if self.break_search:
|
||||
return
|
||||
self.review_total_count = len(self.duplicate_song_list)
|
||||
if self.review_total_count == 0:
|
||||
self.notify_no_duplicates()
|
||||
else:
|
||||
# Create a worker/process pool to check the songs.
|
||||
process_number = max(1, multiprocessing.cpu_count() - 1)
|
||||
pool = multiprocessing.Pool(process_number)
|
||||
song_list = SongIterator(songs)
|
||||
#song_list = [(songs[outer_song_counter], songs[inner_song_counter]) for outer_song_counter in range(max_songs - 1) for inner_song_counter in range(outer_song_counter + 1, max_songs)]
|
||||
result = pool.imap_unordered(songs_probably_equal, song_list, 30)
|
||||
# Do not accept any further tasks. Also this closes the processes if all tasks are done.
|
||||
pool.close()
|
||||
# While the processes are still working, start to look at the results.
|
||||
for song_tuple in result:
|
||||
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
|
||||
# The call to process_events() will keep the GUI responsive.
|
||||
self.application.process_events()
|
||||
if self.break_search:
|
||||
pool.terminate()
|
||||
return
|
||||
if song_tuple is None:
|
||||
continue
|
||||
song1, song2 = song_tuple
|
||||
duplicate_added = self.add_duplicates_to_song_list(song1, song2)
|
||||
if duplicate_added:
|
||||
self.found_duplicates_edit.appendPlainText(song1.title + " = " + song2.title)
|
||||
if self.duplicate_song_list:
|
||||
self.button(QtGui.QWizard.NextButton).show()
|
||||
else:
|
||||
self.notify_no_duplicates()
|
||||
finally:
|
||||
self.application.set_normal_cursor()
|
||||
elif page_id == self.review_page_id:
|
||||
|
@ -52,13 +52,15 @@ MIN_BLOCK_SIZE = 70
|
||||
MAX_TYPO_SIZE = 3
|
||||
|
||||
|
||||
def songs_probably_equal(song1, song2):
|
||||
def songs_probably_equal(song1, song2=None):
|
||||
"""
|
||||
Calculate and return whether two songs are probably equal.
|
||||
|
||||
:param song1: The first song to compare.
|
||||
:param song2: The second song to compare.
|
||||
"""
|
||||
if song2 is None:
|
||||
song1, song2 = song1
|
||||
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
||||
small = song1.search_lyrics
|
||||
large = song2.search_lyrics
|
||||
@ -75,8 +77,9 @@ def songs_probably_equal(song1, song2):
|
||||
for element in diff_no_typos:
|
||||
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
|
||||
length_of_equal_blocks += _op_length(element)
|
||||
|
||||
if length_of_equal_blocks >= MIN_BLOCK_SIZE:
|
||||
return True
|
||||
return song1, song2
|
||||
# Check 2: Similarity based on the relative length of the longest equal block.
|
||||
# Calculate the length of the largest equal block of the diff set.
|
||||
length_of_longest_equal_block = 0
|
||||
@ -84,9 +87,9 @@ def songs_probably_equal(song1, song2):
|
||||
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
|
||||
length_of_longest_equal_block = _op_length(element)
|
||||
if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3:
|
||||
return True
|
||||
return song1, song2
|
||||
# Both checks failed. We assume the songs are not equal.
|
||||
return False
|
||||
return None
|
||||
|
||||
|
||||
def _op_length(opcode):
|
||||
|
Loading…
Reference in New Issue
Block a user