forked from openlp/openlp
Improved duplicate check speed
This commit is contained in:
parent
ff78a99fc1
commit
ed29edec3a
@ -31,6 +31,7 @@ The duplicate song removal logic for OpenLP.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from PyQt4 import QtCore, QtGui
|
from PyQt4 import QtCore, QtGui
|
||||||
@ -45,6 +46,17 @@ from openlp.plugins.songs.lib.songcompare import songs_probably_equal
|
|||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SongIterator(object):
    """
    Iterable over all unordered pairs of songs taken from a list. Each pair is produced exactly once, as a tuple
    whose first song comes before the second one in the list.
    """
    def __init__(self, songs):
        """
        Remember the songs that should be paired up.

        :param songs: The list of songs to build the pairs from.
        """
        self.songs = songs

    def __iter__(self):
        """
        Yield an ``(earlier_song, later_song)`` tuple for every combination of two different list positions.
        """
        for position, earlier_song in enumerate(self.songs):
            # Only pair with songs further down the list, so no pair is repeated or mirrored.
            for later_song in self.songs[position + 1:]:
                yield (earlier_song, later_song)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class DuplicateSongRemovalForm(OpenLPWizard):
|
class DuplicateSongRemovalForm(OpenLPWizard):
|
||||||
"""
|
"""
|
||||||
This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs
|
This is the Duplicate Song Removal Wizard. It provides functionality to search for and remove duplicate songs
|
||||||
@ -167,24 +179,32 @@ class DuplicateSongRemovalForm(OpenLPWizard):
|
|||||||
max_progress_count = max_songs * (max_songs - 1) // 2
|
max_progress_count = max_songs * (max_songs - 1) // 2
|
||||||
self.duplicate_search_progress_bar.setMaximum(max_progress_count)
|
self.duplicate_search_progress_bar.setMaximum(max_progress_count)
|
||||||
songs = self.plugin.manager.get_all_objects(Song)
|
songs = self.plugin.manager.get_all_objects(Song)
|
||||||
for outer_song_counter in range(max_songs - 1):
|
# Create a worker/process pool to check the songs.
|
||||||
for inner_song_counter in range(outer_song_counter + 1, max_songs):
|
process_number = max(1, multiprocessing.cpu_count() - 1)
|
||||||
if songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]):
|
pool = multiprocessing.Pool(process_number)
|
||||||
duplicate_added = self.add_duplicates_to_song_list(
|
song_list = SongIterator(songs)
|
||||||
songs[outer_song_counter], songs[inner_song_counter])
|
#song_list = [(songs[outer_song_counter], songs[inner_song_counter]) for outer_song_counter in range(max_songs - 1) for inner_song_counter in range(outer_song_counter + 1, max_songs)]
|
||||||
if duplicate_added:
|
result = pool.imap_unordered(songs_probably_equal, song_list, 30)
|
||||||
self.found_duplicates_edit.appendPlainText(
|
# Do not accept any further tasks. This also lets the worker processes exit once all tasks are done.
|
||||||
songs[outer_song_counter].title + " = " + songs[inner_song_counter].title)
|
pool.close()
|
||||||
|
# While the processes are still working, start to look at the results.
|
||||||
|
for song_tuple in result:
|
||||||
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
|
self.duplicate_search_progress_bar.setValue(self.duplicate_search_progress_bar.value() + 1)
|
||||||
# The call to process_events() will keep the GUI responsive.
|
# The call to process_events() will keep the GUI responsive.
|
||||||
self.application.process_events()
|
self.application.process_events()
|
||||||
if self.break_search:
|
if self.break_search:
|
||||||
|
pool.terminate()
|
||||||
return
|
return
|
||||||
self.review_total_count = len(self.duplicate_song_list)
|
if song_tuple is None:
|
||||||
if self.review_total_count == 0:
|
continue
|
||||||
self.notify_no_duplicates()
|
song1, song2 = song_tuple
|
||||||
else:
|
duplicate_added = self.add_duplicates_to_song_list(song1, song2)
|
||||||
|
if duplicate_added:
|
||||||
|
self.found_duplicates_edit.appendPlainText(song1.title + " = " + song2.title)
|
||||||
|
if self.duplicate_song_list:
|
||||||
self.button(QtGui.QWizard.NextButton).show()
|
self.button(QtGui.QWizard.NextButton).show()
|
||||||
|
else:
|
||||||
|
self.notify_no_duplicates()
|
||||||
finally:
|
finally:
|
||||||
self.application.set_normal_cursor()
|
self.application.set_normal_cursor()
|
||||||
elif page_id == self.review_page_id:
|
elif page_id == self.review_page_id:
|
||||||
|
@ -52,13 +52,15 @@ MIN_BLOCK_SIZE = 70
|
|||||||
MAX_TYPO_SIZE = 3
|
MAX_TYPO_SIZE = 3
|
||||||
|
|
||||||
|
|
||||||
def songs_probably_equal(song1, song2):
|
def songs_probably_equal(song1, song2=None):
|
||||||
"""
|
"""
|
||||||
Calculate and return whether two songs are probably equal.
|
Calculate whether two songs are probably equal. Returns the (song1, song2) tuple if they are, otherwise None.
|
||||||
|
|
||||||
:param song1: The first song to compare.
|
:param song1: The first song to compare.
|
||||||
:param song2: The second song to compare.
|
:param song2: The second song to compare.
|
||||||
"""
|
"""
|
||||||
|
if song2 is None:
|
||||||
|
song1, song2 = song1
|
||||||
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
||||||
small = song1.search_lyrics
|
small = song1.search_lyrics
|
||||||
large = song2.search_lyrics
|
large = song2.search_lyrics
|
||||||
@ -75,8 +77,9 @@ def songs_probably_equal(song1, song2):
|
|||||||
for element in diff_no_typos:
|
for element in diff_no_typos:
|
||||||
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
|
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
|
||||||
length_of_equal_blocks += _op_length(element)
|
length_of_equal_blocks += _op_length(element)
|
||||||
|
|
||||||
if length_of_equal_blocks >= MIN_BLOCK_SIZE:
|
if length_of_equal_blocks >= MIN_BLOCK_SIZE:
|
||||||
return True
|
return song1, song2
|
||||||
# Check 2: Similarity based on the relative length of the longest equal block.
|
# Check 2: Similarity based on the relative length of the longest equal block.
|
||||||
# Calculate the length of the largest equal block of the diff set.
|
# Calculate the length of the largest equal block of the diff set.
|
||||||
length_of_longest_equal_block = 0
|
length_of_longest_equal_block = 0
|
||||||
@ -84,9 +87,9 @@ def songs_probably_equal(song1, song2):
|
|||||||
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
|
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
|
||||||
length_of_longest_equal_block = _op_length(element)
|
length_of_longest_equal_block = _op_length(element)
|
||||||
if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3:
|
if length_of_equal_blocks >= MIN_BLOCK_SIZE or length_of_longest_equal_block > len(small) * 2 // 3:
|
||||||
return True
|
return song1, song2
|
||||||
# Both checks failed. We assume the songs are not equal.
|
# Both checks failed. We assume the songs are not equal.
|
||||||
return False
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _op_length(opcode):
|
def _op_length(opcode):
|
||||||
|
Loading…
Reference in New Issue
Block a user