diff --git a/openlp/plugins/songs/lib/songcompare.py b/openlp/plugins/songs/lib/songcompare.py new file mode 100644 index 000000000..e230d9f53 --- /dev/null +++ b/openlp/plugins/songs/lib/songcompare.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- +# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4 + +############################################################################### +# OpenLP - Open Source Lyrics Projection # +# --------------------------------------------------------------------------- # +# Copyright (c) 2008-2013 Raoul Snyman # +# Portions copyright (c) 2008-2013 Tim Bentley, Gerald Britton, Jonathan # +# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, # +# Meinert Jordan, Armin Köhler, Erik Lundin, Edwin Lunando, Brian T. Meyer. # +# Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias Põldaru, # +# Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, # +# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Dave Warnock, # +# Frode Woldsund, Martin Zibricky, Patrick Zimmermann # +# --------------------------------------------------------------------------- # +# This program is free software; you can redistribute it and/or modify it # +# under the terms of the GNU General Public License as published by the Free # +# Software Foundation; version 2 of the License. # +# # +# This program is distributed in the hope that it will be useful, but WITHOUT # +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # +# more details. 
# +# # +# You should have received a copy of the GNU General Public License along # +# with this program; if not, write to the Free Software Foundation, Inc., 59 # +# Temple Place, Suite 330, Boston, MA 02111-1307 USA # +############################################################################### +""" +The :mod:`songcompare` module provides functionality to search for +duplicate songs. It provides a single function, :func:`songs_probably_equal`. + +The algorithm is based on the diff algorithm. +First a diffset is calculated for two songs. +To compensate for typos, all differences no longer than max_typo_size that lie +next to parts of at least min_fragment_size are removed and the surrounding equal parts are merged. +Finally two conditions can qualify a song tuple to be a duplicate: +1. There is a block of equal content that is at least min_block_size large. + This condition should hit for all larger songs that have a long enough + equal part. Even if only one verse is equal this condition should still hit. +2. Two thirds of the smaller song is contained in the larger song. + This condition should hit if one of the two songs (or both) is small (smaller + than the min_block_size), but most of the song is contained in the other song. +""" +import difflib + + +min_fragment_size = 5 +min_block_size = 70 +max_typo_size = 3 + + +def songs_probably_equal(song1, song2): + """ + Calculate and return whether two songs are probably equal. + + ``song1`` + The first song to compare. + + ``song2`` + The second song to compare. 
+ """ + if len(song1.search_lyrics) < len(song2.search_lyrics): + small = song1.search_lyrics + large = song2.search_lyrics + else: + small = song2.search_lyrics + large = song1.search_lyrics + differ = difflib.SequenceMatcher(a=large, b=small) + diff_tuples = differ.get_opcodes() + diff_no_typos = __remove_typos(diff_tuples) + if __length_of_equal_blocks(diff_no_typos) >= min_block_size or \ + __length_of_longest_equal_block(diff_no_typos) > len(small) * 2 / 3: + return True + else: + return False + + +def __op_length(opcode): + """ + Return the length of a given difference. + + ``opcode`` + The difference. + """ + return max(opcode[2] - opcode[1], opcode[4] - opcode[3]) + + +def __remove_typos(diff): + """ + Remove typos from a diff set. A typo is a small difference (at most max_typo_size long). + + ``diff`` + The diff set to remove the typos from. + """ + # Remove typo at beginning of the string. + if len(diff) >= 2: + if diff[0][0] != "equal" and __op_length(diff[0]) <= max_typo_size and \ + __op_length(diff[1]) >= min_fragment_size: + del diff[0] + # Remove typos in the middle of the string. + if len(diff) >= 3: + for index in range(len(diff) - 3, -1, -1): + if __op_length(diff[index]) >= min_fragment_size and \ + diff[index + 1][0] != "equal" and __op_length(diff[index + 1]) <= max_typo_size and \ + __op_length(diff[index + 2]) >= min_fragment_size: + del diff[index + 1] + # Remove typo at the end of the string. + if len(diff) >= 2: + if __op_length(diff[-2]) >= min_fragment_size and \ + diff[-1][0] != "equal" and __op_length(diff[-1]) <= max_typo_size: + del diff[-1] + + # Merge the bordering equal passages that occurred by removing differences. 
+ for index in range(len(diff) - 2, -1, -1): + if diff[index][0] == "equal" and __op_length(diff[index]) >= min_fragment_size and \ + diff[index + 1][0] == "equal" and __op_length(diff[index + 1]) >= min_fragment_size: + diff[index] = ("equal", diff[index][1], diff[index + 1][2], diff[index][3], + diff[index + 1][4]) + del diff[index + 1] + + return diff + + +def __length_of_equal_blocks(diff): + """ + Return the total length of all equal blocks in a diff set. + Blocks smaller than min_block_size are not counted. + + ``diff`` + The diff set to return the length for. + """ + length = 0 + for element in diff: + if element[0] == "equal" and __op_length(element) >= min_block_size: + length += __op_length(element) + return length + + +def __length_of_longest_equal_block(diff): + """ + Return the length of the largest equal block in a diff set. + + ``diff`` + The diff set to return the length for. + """ + length = 0 + for element in diff: + if element[0] == "equal" and __op_length(element) > length: + length = __op_length(element) + return length diff --git a/tests/functional/openlp_plugins/songs/__init__.py b/tests/functional/openlp_plugins/songs/__init__.py deleted file mode 100644 index 0157fb2f0..000000000 --- a/tests/functional/openlp_plugins/songs/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import sip -sip.setapi(u'QDate', 2) -sip.setapi(u'QDateTime', 2) -sip.setapi(u'QString', 2) -sip.setapi(u'QTextStream', 2) -sip.setapi(u'QTime', 2) -sip.setapi(u'QUrl', 2) -sip.setapi(u'QVariant', 2) diff --git a/tests/functional/openlp_plugins/songs/test_lib.py b/tests/functional/openlp_plugins/songs/test_lib.py index 9f04e7350..a02563b22 100644 --- a/tests/functional/openlp_plugins/songs/test_lib.py +++ b/tests/functional/openlp_plugins/songs/test_lib.py @@ -34,64 +34,82 @@ from mock import MagicMock from openlp.plugins.songs.lib.songcompare import songs_probably_equal class TestLib(TestCase): + def setUp(self): + """ + Mock up two songs and provide a set of lyrics for the 
songs_probably_equal tests. + """ + self.full_lyrics =u'''amazing grace how sweet the sound that saved a wretch like me i once was lost but now am + found was blind but now i see twas grace that taught my heart to fear and grace my fears relieved how + precious did that grace appear the hour i first believed through many dangers toils and snares i have already + come tis grace that brought me safe thus far and grace will lead me home''' + self.short_lyrics =u'''twas grace that taught my heart to fear and grace my fears relieved how precious did that + grace appear the hour i first believed''' + self.error_lyrics =u'''amazing how sweet the trumpet that saved a wrench like me i once was losst but now am + found waf blind but now i see it was grace that taught my heart to fear and grace my fears relieved how + precious did that grace appppppppear the hour i first believedxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx snares i have + already come to this grace that brought me safe so far and grace will lead me home''' + self.different_lyrics=u'''on a hill far away stood an old rugged cross the emblem of suffering and shame and i love + that old cross where the dearest and best for a world of lost sinners was slain so ill cherish the old rugged + cross till my trophies at last i lay down i will cling to the old rugged cross and exchange it some day for a + crown''' + self.song1 = MagicMock() + self.song2 = MagicMock() - def songs_probably_equal_test(self): + def songs_probably_equal_same_song_test(self): """ - Test the songs_probably_equal function. + Test the songs_probably_equal function with twice the same song. 
""" - full_lyrics =u'''amazing grace how sweet the sound that saved a wretch like me i once was lost but now am - found was blind but now i see twas grace that taught my heart to fear and grace my fears relieved how - precious did that grace appear the hour i first believed through many dangers toils and snares i have already - come tis grace that brought me safe thus far and grace will lead me home''' - short_lyrics =u'''twas grace that taught my heart to fear and grace my fears relieved how precious did that - grace appear the hour i first believed''' - error_lyrics =u'''amazing how sweet the trumpet that saved a wrench like me i once was losst but now am - found waf blind but now i see it was grace that taught my heart to fear and grace my fears relieved how - precious did that grace appppppppear the hour i first believedxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx snares i have - already come to this grace that brought me safe so far and grace will lead me home''' - different_lyrics=u'''on a hill far away stood an old rugged cross the emblem of suffering and shame and i love - that old cross where the dearest and best for a world of lost sinners was slain so ill cherish the old rugged - cross till my trophies at last i lay down i will cling to the old rugged cross and exchange it some day for a - crown''' - song1 = MagicMock() - song2 = MagicMock() - #GIVEN: Two equal songs. - song1.search_lyrics = full_lyrics - song2.search_lyrics = full_lyrics + self.song1.search_lyrics = self.full_lyrics + self.song2.search_lyrics = self.full_lyrics #WHEN: We compare those songs for equality. - result = songs_probably_equal(song1, song2) + result = songs_probably_equal(self.song1, self.song2) #THEN: The result should be True. assert result is True, u'The result should be True' - + + + def songs_probably_equal_short_song_test(self): + """ + Test the songs_probably_equal function with a song and a shorter version of the same song. + """ #GIVEN: A song and a short version of the same song. 
- song1.search_lyrics = full_lyrics - song2.search_lyrics = short_lyrics + self.song1.search_lyrics = self.full_lyrics + self.song2.search_lyrics = self.short_lyrics #WHEN: We compare those songs for equality. - result = songs_probably_equal(song1, song2) + result = songs_probably_equal(self.song1, self.song2) #THEN: The result should be True. assert result is True, u'The result should be True' - + + + def songs_probably_equal_error_song_test(self): + """ + Test the songs_probably_equal function with a song and a very erroneous version of the same song. + """ #GIVEN: A song and the same song with lots of errors. - song1.search_lyrics = full_lyrics - song2.search_lyrics = error_lyrics + self.song1.search_lyrics = self.full_lyrics + self.song2.search_lyrics = self.error_lyrics #WHEN: We compare those songs for equality. - result = songs_probably_equal(song1, song2) + result = songs_probably_equal(self.song1, self.song2) #THEN: The result should be True. assert result is True, u'The result should be True' - + + + def songs_probably_equal_different_song_test(self): + """ + Test the songs_probably_equal function with two different songs. + """ #GIVEN: Two different songs. - song1.search_lyrics = full_lyrics - song2.search_lyrics = different_lyrics + self.song1.search_lyrics = self.full_lyrics + self.song2.search_lyrics = self.different_lyrics #WHEN: We compare those songs for equality. - result = songs_probably_equal(song1, song2) + result = songs_probably_equal(self.song1, self.song2) #THEN: The result should be False. assert result is False, u'The result should be False'