Lots more comments.

This commit is contained in:
Patrick Zimmermann 2013-01-22 22:11:01 +01:00
parent bbe9293392
commit bc3f854921

View File

@ -26,19 +26,27 @@
# with this program; if not, write to the Free Software Foundation, Inc., 59 # # with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Temple Place, Suite 330, Boston, MA 02111-1307 USA #
############################################################################### ###############################################################################
import logging
import difflib import difflib
from openlp.core.lib import translate
from openlp.plugins.songs.lib.db import Song from openlp.plugins.songs.lib.db import Song
from openlp.plugins.songs.lib.ui import SongStrings
log = logging.getLogger(__name__)
class DuplicateSongFinder(object): class DuplicateSongFinder(object):
""" """
The :class:`DuplicateSongFinder` class provides functionality to search for The :class:`DuplicateSongFinder` class provides functionality to search for
and remove duplicate songs. duplicate songs.
The algorithm is based on the diff algorithm.
First a diffset is calculated for two songs.
To compensate for typos, all differences that are no larger than a
limit (<=maxTypoSize) and are surrounded by larger equal blocks
(>minFragmentSize) are removed and the surrounding equal parts are merged.
Finally two conditions can qualify a song tuple to be a duplicate:
1. There is a block of equal content that is at least minBlockSize large.
This condition should hit for all larger songs that have a long enough
equal part. Even if only one verse is equal this condition should still hit.
2. Two thirds of the smaller song is contained in the larger song.
This condition should hit if one of the two songs (or both) is small (smaller
than the minBlockSize), but most of the song is contained in the other song.
""" """
def __init__(self): def __init__(self):
@ -47,6 +55,15 @@ class DuplicateSongFinder(object):
self.maxTypoSize = 3 self.maxTypoSize = 3
def songsProbablyEqual(self, song1, song2): def songsProbablyEqual(self, song1, song2):
"""
Calculate and return whether two songs are probably equal.
``song1``
The first song to compare.
``song2``
The second song to compare.
"""
if len(song1.search_lyrics) < len(song2.search_lyrics): if len(song1.search_lyrics) < len(song2.search_lyrics):
small = song1.search_lyrics small = song1.search_lyrics
large = song2.search_lyrics large = song2.search_lyrics
@ -64,9 +81,22 @@ class DuplicateSongFinder(object):
return False return False
def __opLength(self, opcode): def __opLength(self, opcode):
"""
Return the length of a given difference.
``opcode``
The difference.
"""
return max(opcode[2]-opcode[1], opcode[4] - opcode[3]) return max(opcode[2]-opcode[1], opcode[4] - opcode[3])
def __removeTypos(self, diff): def __removeTypos(self, diff):
"""
Remove typos from a diff set. A typo is a small difference (<=maxTypoSize)
surrounded by larger equal passages (>minFragmentSize).
``diff``
The diff set to remove the typos from.
"""
#remove typo at beginning of string #remove typo at beginning of string
if len(diff) >= 2: if len(diff) >= 2:
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \ if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
@ -96,6 +126,13 @@ class DuplicateSongFinder(object):
return diff return diff
def __lengthOfEqualBlocks(self, diff): def __lengthOfEqualBlocks(self, diff):
"""
Return the total length of all equal blocks in a diff set.
Blocks smaller than minBlockSize are not counted.
``diff``
The diff set to return the length for.
"""
length = 0 length = 0
for element in diff: for element in diff:
if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize: if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
@ -103,8 +140,15 @@ class DuplicateSongFinder(object):
return length return length
def __lengthOfLongestEqualBlock(self, diff): def __lengthOfLongestEqualBlock(self, diff):
"""
Return the length of the largest equal block in a diff set.
``diff``
The diff set to return the length for.
"""
length = 0 length = 0
for element in diff: for element in diff:
if element[0] == "equal" and self.__opLength(element) > length: if element[0] == "equal" and self.__opLength(element) > length:
length = self.__opLength(element) length = self.__opLength(element)
return length return length