Second attempt to fix duplicate-song detection on Windows

This commit is contained in:
Tomas Groth 2014-11-05 21:44:45 +00:00
parent 02a159bf65
commit 992ac3bbb8
2 changed files with 16 additions and 15 deletions

View File

@ -33,6 +33,7 @@ The duplicate song removal logic for OpenLP.
import logging import logging
import multiprocessing import multiprocessing
import os import os
import functools
from PyQt4 import QtCore, QtGui from PyQt4 import QtCore, QtGui
@ -46,17 +47,16 @@ from openlp.plugins.songs.lib.songcompare import songs_probably_equal
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def song_generator(songs): def tuple_generator(number_of_songs):
""" """
This is a generator function to return tuples of tuple with two songs and their position in the song array. This is a generator function to return tuples of two songs position. When completed then all songs position have
When completed then all songs have once been returned combined with any other songs. once been returned combined with any other songs position.
:param songs: All songs in the database. :param number_of_songs: Number of songs in the DB.
""" """
for outer_song_counter in range(len(songs) - 1): for outer_song_counter in range(number_of_songs - 1):
for inner_song_counter in range(outer_song_counter + 1, len(songs)): for inner_song_counter in range(outer_song_counter + 1, number_of_songs):
yield ((outer_song_counter, songs[outer_song_counter].search_lyrics), yield (outer_song_counter, inner_song_counter)
(inner_song_counter, songs[inner_song_counter].search_lyrics))
class DuplicateSongRemovalForm(OpenLPWizard, RegistryProperties): class DuplicateSongRemovalForm(OpenLPWizard, RegistryProperties):
@ -184,7 +184,9 @@ class DuplicateSongRemovalForm(OpenLPWizard, RegistryProperties):
# Create a worker/process pool to check the songs. # Create a worker/process pool to check the songs.
process_number = max(1, multiprocessing.cpu_count() - 1) process_number = max(1, multiprocessing.cpu_count() - 1)
pool = multiprocessing.Pool(process_number) pool = multiprocessing.Pool(process_number)
result = pool.imap_unordered(songs_probably_equal, song_generator(songs), 30) # Create array with all lyrics
song_lyrics = [song.search_lyrics for song in songs]
result = pool.imap_unordered(functools.partial(songs_probably_equal, song_lyrics), tuple_generator(len(songs)), 30)
# Do not accept any further tasks. Also this closes the processes if all tasks are done. # Do not accept any further tasks. Also this closes the processes if all tasks are done.
pool.close() pool.close()
# While the processes are still working, start to look at the results. # While the processes are still working, start to look at the results.

View File

@ -52,15 +52,14 @@ MIN_BLOCK_SIZE = 70
MAX_TYPO_SIZE = 3 MAX_TYPO_SIZE = 3
def songs_probably_equal(song_tupel): def songs_probably_equal(songs, pos_tupel):
""" """
Calculate and return whether two songs are probably equal. Calculate and return whether two songs are probably equal.
:param song_tupel: A tuple of two songs to compare. :param song_tupel: A tuple of two songs to compare.
""" """
song1, song2 = song_tupel lyrics1 = songs[pos_tupel[0]]
pos1, lyrics1 = song1 lyrics2 = songs[pos_tupel[1]]
pos2, lyrics2 = song2
if len(lyrics1) < len(lyrics2): if len(lyrics1) < len(lyrics2):
small = lyrics1 small = lyrics1
large = lyrics2 large = lyrics2
@ -79,7 +78,7 @@ def songs_probably_equal(song_tupel):
length_of_equal_blocks += _op_length(element) length_of_equal_blocks += _op_length(element)
if length_of_equal_blocks >= MIN_BLOCK_SIZE: if length_of_equal_blocks >= MIN_BLOCK_SIZE:
return pos1, pos2 return pos_tupel[0], pos_tupel[1]
# Check 2: Similarity based on the relative length of the longest equal block. # Check 2: Similarity based on the relative length of the longest equal block.
# Calculate the length of the largest equal block of the diff set. # Calculate the length of the largest equal block of the diff set.
length_of_longest_equal_block = 0 length_of_longest_equal_block = 0
@ -87,7 +86,7 @@ def songs_probably_equal(song_tupel):
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block: if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
length_of_longest_equal_block = _op_length(element) length_of_longest_equal_block = _op_length(element)
if length_of_longest_equal_block > len(small) * 2 // 3: if length_of_longest_equal_block > len(small) * 2 // 3:
return pos1, pos2 return pos_tupel[0], pos_tupel[1]
# Both checks failed. We assume the songs are not equal. # Both checks failed. We assume the songs are not equal.
return None return None