Lots more comments.

This commit is contained in:
Patrick Zimmermann 2013-01-22 22:11:01 +01:00
parent bbe9293392
commit bc3f854921
1 changed files with 74 additions and 30 deletions

View File

@ -26,19 +26,27 @@
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
import logging
import difflib
from openlp.core.lib import translate
from openlp.plugins.songs.lib.db import Song
from openlp.plugins.songs.lib.ui import SongStrings
log = logging.getLogger(__name__)
class DuplicateSongFinder(object):
"""
The :class:`DuplicateSongFinder` class provides functionality to search for
and remove duplicate songs.
duplicate songs.
The algorithm is based on the diff algorithm.
First a diffset is calculated for two songs.
To compensate for typos all differences that are smaller than a
limit (<maxTypoSize) and are surrounded by larger equal blocks
(>minFragmentSize) are removed and the surrounding equal parts are merged.
Finally two conditions can qualify a song tuple to be a duplicate:
1. There is a block of equal content that is at least minBlockSize large.
This condition should hit for all larger songs that have a long enough
equal part. Even if only one verse is equal this condition should still hit.
2. Two thirds of the smaller song is contained in the larger song.
This condition should hit if one of the two songs (or both) is small (smaller
than the minBlockSize), but most of the song is contained in the other song.
"""
def __init__(self):
@ -47,6 +55,15 @@ class DuplicateSongFinder(object):
self.maxTypoSize = 3
def songsProbablyEqual(self, song1, song2):
"""
Calculate and return whether two songs are probably equal.
``song1``
The first song to compare.
``song2``
The second song to compare.
"""
if len(song1.search_lyrics) < len(song2.search_lyrics):
small = song1.search_lyrics
large = song2.search_lyrics
@ -64,38 +81,58 @@ class DuplicateSongFinder(object):
return False
def __opLength(self, opcode):
"""
Return the length of a given difference.
``opcode``
The difference.
"""
return max(opcode[2]-opcode[1], opcode[4] - opcode[3])
def __removeTypos(self, diff):
"""
Remove typos from a diff set. A typo is a small difference (<maxTypoSize)
surrounded by larger equal passages (>minFragmentSize).
``diff``
The diff set to remove the typos from.
"""
#remove typo at beginning of string
if len(diff) >= 2:
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
self.__opLength(diff[1]) >= self.minFragmentSize:
del diff[0]
#remove typos in the middle of string
if len(diff) >= 3:
for index in range(len(diff)-3, -1, -1):
if self.__opLength(diff[index]) >= self.minFragmentSize and \
diff[index+1][0] != "equal" and self.__opLength(diff[index+1]) <= self.maxTypoSize and \
self.__opLength(diff[index+2]) >= self.minFragmentSize:
del diff[index+1]
#remove typo at the end of string
if len(diff) >= 2:
if self.__opLength(diff[-2]) >= self.minFragmentSize and \
diff[-1][0] != "equal" and self.__opLength(diff[-1]) <= self.maxTypoSize:
del diff[-1]
#merge fragments
for index in range(len(diff)-2, -1, -1):
if diff[index][0] == "equal" and self.__opLength(diff[index]) >= self.minFragmentSize and \
diff[index+1][0] == "equal" and self.__opLength(diff[index+1]) >= self.minFragmentSize:
diff[index] = ("equal", diff[index][1], diff[index+1][2], diff[index][3],
diff[index+1][4])
if len(diff) >= 2:
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
self.__opLength(diff[1]) >= self.minFragmentSize:
del diff[0]
#remove typos in the middle of string
if len(diff) >= 3:
for index in range(len(diff)-3, -1, -1):
if self.__opLength(diff[index]) >= self.minFragmentSize and \
diff[index+1][0] != "equal" and self.__opLength(diff[index+1]) <= self.maxTypoSize and \
self.__opLength(diff[index+2]) >= self.minFragmentSize:
del diff[index+1]
#remove typo at the end of string
if len(diff) >= 2:
if self.__opLength(diff[-2]) >= self.minFragmentSize and \
diff[-1][0] != "equal" and self.__opLength(diff[-1]) <= self.maxTypoSize:
del diff[-1]
return diff
#merge fragments
for index in range(len(diff)-2, -1, -1):
if diff[index][0] == "equal" and self.__opLength(diff[index]) >= self.minFragmentSize and \
diff[index+1][0] == "equal" and self.__opLength(diff[index+1]) >= self.minFragmentSize:
diff[index] = ("equal", diff[index][1], diff[index+1][2], diff[index][3],
diff[index+1][4])
del diff[index+1]
return diff
def __lengthOfEqualBlocks(self, diff):
"""
Return the total length of all equal blocks in a diff set.
Blocks smaller than minBlockSize are not counted.
``diff``
The diff set to return the length for.
"""
length = 0
for element in diff:
if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
@ -103,8 +140,15 @@ class DuplicateSongFinder(object):
return length
def __lengthOfLongestEqualBlock(self, diff):
"""
Return the length of the largest equal block in a diff set.
``diff``
The diff set to return the length for.
"""
length = 0
for element in diff:
if element[0] == "equal" and self.__opLength(element) > length:
length = self.__opLength(element)
return length