From bc3f854921cbbc2e9c0eef2b7dcb3a6b1732e371 Mon Sep 17 00:00:00 2001 From: Patrick Zimmermann Date: Tue, 22 Jan 2013 22:11:01 +0100 Subject: [PATCH] Lots more comments. --- .../plugins/songs/lib/duplicatesongfinder.py | 104 +++++++++++++----- 1 file changed, 74 insertions(+), 30 deletions(-) diff --git a/openlp/plugins/songs/lib/duplicatesongfinder.py b/openlp/plugins/songs/lib/duplicatesongfinder.py index 6ace0a68b..bf984a5ea 100644 --- a/openlp/plugins/songs/lib/duplicatesongfinder.py +++ b/openlp/plugins/songs/lib/duplicatesongfinder.py @@ -26,19 +26,27 @@ # with this program; if not, write to the Free Software Foundation, Inc., 59 # # Temple Place, Suite 330, Boston, MA 02111-1307 USA # ############################################################################### -import logging import difflib -from openlp.core.lib import translate from openlp.plugins.songs.lib.db import Song -from openlp.plugins.songs.lib.ui import SongStrings - -log = logging.getLogger(__name__) class DuplicateSongFinder(object): """ The :class:`DuplicateSongFinder` class provides functionality to search for - and remove duplicate songs. + duplicate songs. + + The algorithm is based on the diff algorithm. + First a diffset is calculated for two songs. + To compensate for typos all differences that are smaller than a + limit (minFragmentSize) are removed and the surrounding equal parts are merged. + Finally two conditions can qualify a song tuple to be a duplicate: + 1. There is a block of equal content that is at least minBlockSize large. + This condition should hit for all larger songs that have a long enough + equal part. Even if only one verse is equal this condition should still hit. + 2. Two thirds of the smaller song is contained in the larger song. + This condition should hit if one of the two songs (or both) is small (smaller + than the minBlockSize), but most of the song is contained in the other song. """ def __init__(self): @@ -47,6 +55,15 @@ class DuplicateSongFinder(object): self.maxTypoSize = 3 def songsProbablyEqual(self, song1, song2): + """ + Calculate and return whether two songs are probably equal. + + ``song1`` + The first song to compare. + + ``song2`` + The second song to compare. + """ if len(song1.search_lyrics) < len(song2.search_lyrics): small = song1.search_lyrics large = song2.search_lyrics @@ -64,38 +81,58 @@ class DuplicateSongFinder(object): return False def __opLength(self, opcode): + """ + Return the length of a given difference. + + ``opcode`` + The difference. + """ return max(opcode[2]-opcode[1], opcode[4] - opcode[3]) def __removeTypos(self, diff): + """ + Remove typos from a diff set. A typo is a small difference (minFragmentSize). + + ``diff`` + The diff set to remove the typos from. + """ #remove typo at beginning of string - if len(diff) >= 2: - if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \ - self.__opLength(diff[1]) >= self.minFragmentSize: - del diff[0] - #remove typos in the middle of string - if len(diff) >= 3: - for index in range(len(diff)-3, -1, -1): - if self.__opLength(diff[index]) >= self.minFragmentSize and \ - diff[index+1][0] != "equal" and self.__opLength(diff[index+1]) <= self.maxTypoSize and \ - self.__opLength(diff[index+2]) >= self.minFragmentSize: - del diff[index+1] - #remove typo at the end of string - if len(diff) >= 2: - if self.__opLength(diff[-2]) >= self.minFragmentSize and \ - diff[-1][0] != "equal" and self.__opLength(diff[-1]) <= self.maxTypoSize: - del diff[-1] - - #merge fragments - for index in range(len(diff)-2, -1, -1): - if diff[index][0] == "equal" and self.__opLength(diff[index]) >= self.minFragmentSize and \ - diff[index+1][0] == "equal" and self.__opLength(diff[index+1]) >= self.minFragmentSize: - diff[index] = ("equal", diff[index][1], diff[index+1][2], diff[index][3], - diff[index+1][4]) + if len(diff) >= 2: + if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \ + self.__opLength(diff[1]) >= self.minFragmentSize: + del diff[0] + #remove typos in the middle of string + if len(diff) >= 3: + for index in range(len(diff)-3, -1, -1): + if self.__opLength(diff[index]) >= self.minFragmentSize and \ + diff[index+1][0] != "equal" and self.__opLength(diff[index+1]) <= self.maxTypoSize and \ + self.__opLength(diff[index+2]) >= self.minFragmentSize: del diff[index+1] + #remove typo at the end of string + if len(diff) >= 2: + if self.__opLength(diff[-2]) >= self.minFragmentSize and \ + diff[-1][0] != "equal" and self.__opLength(diff[-1]) <= self.maxTypoSize: + del diff[-1] - return diff + #merge fragments + for index in range(len(diff)-2, -1, -1): + if diff[index][0] == "equal" and self.__opLength(diff[index]) >= self.minFragmentSize and \ + diff[index+1][0] == "equal" and self.__opLength(diff[index+1]) >= self.minFragmentSize: + diff[index] = ("equal", diff[index][1], diff[index+1][2], diff[index][3], + diff[index+1][4]) + del diff[index+1] + + return diff def __lengthOfEqualBlocks(self, diff): + """ + Return the total length of all equal blocks in a diff set. + Blocks smaller than minBlockSize are not counted. + + ``diff`` + The diff set to return the length for. + """ length = 0 for element in diff: if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize: @@ -103,8 +140,15 @@ class DuplicateSongFinder(object): return length def __lengthOfLongestEqualBlock(self, diff): + """ + Return the length of the largest equal block in a diff set. + + ``diff`` + The diff set to return the length for. + """ length = 0 for element in diff: if element[0] == "equal" and self.__opLength(element) > length: length = self.__opLength(element) return length +