Replace DuplicateSongFinder class with a set of functions.

This commit is contained in:
Patrick Zimmermann 2013-02-18 21:27:11 +01:00
parent 570b8e1f45
commit 8c8cd3b867
3 changed files with 10 additions and 168 deletions

View File

@ -29,20 +29,17 @@
""" """
The duplicate song removal logic for OpenLP. The duplicate song removal logic for OpenLP.
""" """
import codecs
import logging import logging
import os import os
from PyQt4 import QtCore, QtGui from PyQt4 import QtCore, QtGui
from openlp.core.lib import Registry, translate, build_icon from openlp.core.lib import Registry, translate
from openlp.core.lib.db import Manager
from openlp.core.lib.ui import UiStrings, critical_error_message_box
from openlp.core.ui.wizard import OpenLPWizard, WizardStrings from openlp.core.ui.wizard import OpenLPWizard, WizardStrings
from openlp.core.utils import AppLocation from openlp.core.utils import AppLocation
from openlp.plugins.songs.lib.db import Song, MediaFile from openlp.plugins.songs.lib.db import Song, MediaFile
from openlp.plugins.songs.lib.duplicatesongfinder import DuplicateSongFinder
from openlp.plugins.songs.forms.songreviewwidget import SongReviewWidget from openlp.plugins.songs.forms.songreviewwidget import SongReviewWidget
from openlp.plugins.songs.lib.songcompare import songs_probably_equal
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -169,8 +166,7 @@ class DuplicateSongRemovalForm(OpenLPWizard):
songs = self.plugin.manager.get_all_objects(Song) songs = self.plugin.manager.get_all_objects(Song)
for outer_song_counter in range(max_songs - 1): for outer_song_counter in range(max_songs - 1):
for inner_song_counter in range(outer_song_counter + 1, max_songs): for inner_song_counter in range(outer_song_counter + 1, max_songs):
double_finder = DuplicateSongFinder() if songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]):
if double_finder.songs_probably_equal(songs[outer_song_counter], songs[inner_song_counter]):
duplicate_added = self.add_duplicates_to_song_list(songs[outer_song_counter], duplicate_added = self.add_duplicates_to_song_list(songs[outer_song_counter],
songs[inner_song_counter]) songs[inner_song_counter])
if duplicate_added: if duplicate_added:

View File

@ -1,153 +0,0 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2013 Raoul Snyman #
# Portions copyright (c) 2008-2013 Tim Bentley, Gerald Britton, Jonathan #
# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, #
# Meinert Jordan, Armin Köhler, Erik Lundin, Edwin Lunando, Brian T. Meyer. #
# Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias Põldaru, #
# Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, #
# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Dave Warnock, #
# Frode Woldsund, Martin Zibricky, Patrick Zimmermann #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
import difflib
from openlp.plugins.songs.lib.db import Song
class DuplicateSongFinder(object):
    """
    The :class:`DuplicateSongFinder` class provides functionality to search for
    duplicate songs.

    The algorithm is based on the diff algorithm.
    First a diffset is calculated for the ``search_lyrics`` of two songs.
    To compensate for typos all differences that are at most ``max_typo_size``
    long and are bordered by equal blocks of at least ``min_fragment_size`` are
    removed and the surrounding equal parts are merged.
    Finally two conditions can qualify a song tuple to be a duplicate:

    1. There is a block of equal content that is at least ``min_block_size``
       large. This condition should hit for all larger songs that have a long
       enough equal part. Even if only one verse is equal this condition should
       still hit.
    2. The longest block of equal content is longer than two thirds of the
       smaller song. This condition should hit if one of the two songs (or
       both) is small (smaller than ``min_block_size``), but most of the song
       is contained in the other song.
    """

    def __init__(self, min_fragment_size=5, min_block_size=70, max_typo_size=3):
        """
        Set up the thresholds used by the comparison.

        ``min_fragment_size``
            Minimum length of an equal passage that may border a typo.

        ``min_block_size``
            Minimum length of an equal block that marks two songs as duplicates.

        ``max_typo_size``
            Maximum length of a difference that is still treated as a typo.
        """
        self.min_fragment_size = min_fragment_size
        self.min_block_size = min_block_size
        self.max_typo_size = max_typo_size

    def songs_probably_equal(self, song1, song2):
        """
        Calculate and return whether two songs are probably equal.

        ``song1``
            The first song to compare.

        ``song2``
            The second song to compare.
        """
        lyrics1 = song1.search_lyrics
        lyrics2 = song2.search_lyrics
        if len(lyrics1) < len(lyrics2):
            small, large = lyrics1, lyrics2
        else:
            small, large = lyrics2, lyrics1
        # NOTE(review): SequenceMatcher is used with its default ``autojunk``
        # heuristic, which only kicks in for sequences of 200+ items. This is
        # left untouched so existing comparison results do not change.
        differ = difflib.SequenceMatcher(a=large, b=small)
        diff_no_typos = self.__remove_typos(differ.get_opcodes())
        if self.__length_of_equal_blocks(diff_no_typos) >= self.min_block_size:
            return True
        # "longest > len(small) * 2 / 3", written with integers only so that
        # the outcome does not depend on Python 2 vs. Python 3 division.
        return 3 * self.__length_of_longest_equal_block(diff_no_typos) > 2 * len(small)

    def __op_length(self, opcode):
        """
        Return the length of a given difference, i.e. the larger of the two
        ranges the opcode spans in the compared sequences.

        ``opcode``
            The difference.
        """
        return max(opcode[2] - opcode[1], opcode[4] - opcode[3])

    def __remove_typos(self, diff):
        """
        Return a copy of a diff set with the typos removed. A typo is a small
        difference (<=max_typo_size) bordered by larger equal passages
        (>=min_fragment_size).

        ``diff``
            The diff set to remove the typos from. It is not modified.
        """
        # Work on a copy: the opcode list handed in is the one SequenceMatcher
        # caches internally, so it must not be altered in place.
        diff = list(diff)
        # Remove typo at beginning of the string.
        if len(diff) >= 2:
            if diff[0][0] != "equal" and self.__op_length(diff[0]) <= self.max_typo_size and \
                    self.__op_length(diff[1]) >= self.min_fragment_size:
                del diff[0]
        # Remove typos in the middle of the string. Walk backwards so that
        # deleting an entry does not shift the entries still to be visited.
        if len(diff) >= 3:
            for index in range(len(diff) - 3, -1, -1):
                if self.__op_length(diff[index]) >= self.min_fragment_size and \
                        diff[index + 1][0] != "equal" and \
                        self.__op_length(diff[index + 1]) <= self.max_typo_size and \
                        self.__op_length(diff[index + 2]) >= self.min_fragment_size:
                    del diff[index + 1]
        # Remove typo at the end of the string.
        if len(diff) >= 2:
            if self.__op_length(diff[-2]) >= self.min_fragment_size and \
                    diff[-1][0] != "equal" and self.__op_length(diff[-1]) <= self.max_typo_size:
                del diff[-1]
        # Merge the bordering equal passages that occurred by removing differences.
        for index in range(len(diff) - 2, -1, -1):
            if diff[index][0] == "equal" and self.__op_length(diff[index]) >= self.min_fragment_size and \
                    diff[index + 1][0] == "equal" and \
                    self.__op_length(diff[index + 1]) >= self.min_fragment_size:
                diff[index] = ("equal", diff[index][1], diff[index + 1][2], diff[index][3],
                    diff[index + 1][4])
                del diff[index + 1]
        return diff

    def __length_of_equal_blocks(self, diff):
        """
        Return the total length of all equal blocks in a diff set.
        Blocks smaller than min_block_size are not counted.

        ``diff``
            The diff set to return the length for.
        """
        lengths = [self.__op_length(element) for element in diff if element[0] == "equal"]
        return sum(length for length in lengths if length >= self.min_block_size)

    def __length_of_longest_equal_block(self, diff):
        """
        Return the length of the largest equal block in a diff set, or 0 if
        the diff set contains no equal block.

        ``diff``
            The diff set to return the length for.
        """
        return max([0] + [self.__op_length(element) for element in diff if element[0] == "equal"])

View File

@ -31,13 +31,13 @@ from unittest import TestCase
from mock import MagicMock from mock import MagicMock
from openlp.plugins.songs.lib.duplicatesongfinder import DuplicateSongFinder from openlp.plugins.songs.lib.songcompare import songs_probably_equal
class TestLib(TestCase): class TestLib(TestCase):
def songs_probably_equal_test(self): def songs_probably_equal_test(self):
""" """
Test the DuplicateSongFinder.songs_probably_equal function. Test the songs_probably_equal function.
""" """
full_lyrics =u'''amazing grace how sweet the sound that saved a wretch like me i once was lost but now am full_lyrics =u'''amazing grace how sweet the sound that saved a wretch like me i once was lost but now am
found was blind but now i see twas grace that taught my heart to fear and grace my fears relieved how found was blind but now i see twas grace that taught my heart to fear and grace my fears relieved how
@ -53,7 +53,6 @@ class TestLib(TestCase):
that old cross where the dearest and best for a world of lost sinners was slain so ill cherish the old rugged that old cross where the dearest and best for a world of lost sinners was slain so ill cherish the old rugged
cross till my trophies at last i lay down i will cling to the old rugged cross and exchange it some day for a cross till my trophies at last i lay down i will cling to the old rugged cross and exchange it some day for a
crown''' crown'''
duplicate_song_finder = DuplicateSongFinder()
song1 = MagicMock() song1 = MagicMock()
song2 = MagicMock() song2 = MagicMock()
@ -62,7 +61,7 @@ class TestLib(TestCase):
song2.search_lyrics = full_lyrics song2.search_lyrics = full_lyrics
#WHEN: We compare those songs for equality. #WHEN: We compare those songs for equality.
result = duplicate_song_finder.songs_probably_equal(song1, song2) result = songs_probably_equal(song1, song2)
#THEN: The result should be True. #THEN: The result should be True.
assert result is True, u'The result should be True' assert result is True, u'The result should be True'
@ -72,7 +71,7 @@ class TestLib(TestCase):
song2.search_lyrics = short_lyrics song2.search_lyrics = short_lyrics
#WHEN: We compare those songs for equality. #WHEN: We compare those songs for equality.
result = duplicate_song_finder.songs_probably_equal(song1, song2) result = songs_probably_equal(song1, song2)
#THEN: The result should be True. #THEN: The result should be True.
assert result is True, u'The result should be True' assert result is True, u'The result should be True'
@ -82,7 +81,7 @@ class TestLib(TestCase):
song2.search_lyrics = error_lyrics song2.search_lyrics = error_lyrics
#WHEN: We compare those songs for equality. #WHEN: We compare those songs for equality.
result = duplicate_song_finder.songs_probably_equal(song1, song2) result = songs_probably_equal(song1, song2)
#THEN: The result should be True. #THEN: The result should be True.
assert result is True, u'The result should be True' assert result is True, u'The result should be True'
@ -92,7 +91,7 @@ class TestLib(TestCase):
song2.search_lyrics = different_lyrics song2.search_lyrics = different_lyrics
#WHEN: We compare those songs for equality. #WHEN: We compare those songs for equality.
result = duplicate_song_finder.songs_probably_equal(song1, song2) result = songs_probably_equal(song1, song2)
#THEN: The result should be False. #THEN: The result should be False.
assert result is False, u'The result should be False' assert result is False, u'The result should be False'