forked from openlp/openlp
Lots more comments.
This commit is contained in:
parent
bbe9293392
commit
bc3f854921
@ -26,19 +26,27 @@
|
||||
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
|
||||
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
||||
###############################################################################
|
||||
import logging
|
||||
import difflib
|
||||
|
||||
from openlp.core.lib import translate
|
||||
from openlp.plugins.songs.lib.db import Song
|
||||
from openlp.plugins.songs.lib.ui import SongStrings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
class DuplicateSongFinder(object):
|
||||
"""
|
||||
The :class:`DuplicateSongFinder` class provides functionality to search for
|
||||
and remove duplicate songs.
|
||||
duplicate songs.
|
||||
|
||||
The algorithm is based on the diff algorithm.
|
||||
First a diffset is calculated for two songs.
|
||||
To compensate for typos all differences that are smaller than a
|
||||
limit (<maxTypoSize) and are surrounded by larger equal blocks
|
||||
(>minFragmentSize) are removed and the surrounding equal parts are merged.
|
||||
Finally two conditions can qualify a song tuple to be a duplicate:
|
||||
1. There is a block of equal content that is at least minBlockSize large.
|
||||
This condition should hit for all larger songs that have a long enough
|
||||
equal part. Even if only one verse is equal this condition should still hit.
|
||||
2. Two thirds of the smaller song is contained in the larger song.
|
||||
This condition should hit if one of the two songs (or both) is small (smaller
|
||||
than the minBlockSize), but most of the song is contained in the other song.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@ -47,6 +55,15 @@ class DuplicateSongFinder(object):
|
||||
self.maxTypoSize = 3
|
||||
|
||||
def songsProbablyEqual(self, song1, song2):
|
||||
"""
|
||||
Calculate and return whether two songs are probably equal.
|
||||
|
||||
``song1``
|
||||
The first song to compare.
|
||||
|
||||
``song2``
|
||||
The second song to compare.
|
||||
"""
|
||||
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
||||
small = song1.search_lyrics
|
||||
large = song2.search_lyrics
|
||||
@ -64,9 +81,22 @@ class DuplicateSongFinder(object):
|
||||
return False
|
||||
|
||||
def __opLength(self, opcode):
    """
    Return the length of a given difference.

    A difference spans one range in each of the two compared texts; the
    reported length is the larger of those two ranges.

    ``opcode``
        The difference. Positions 1 and 2 hold the start and end of the
        first range, positions 3 and 4 the start and end of the second
        range (presumably a ``difflib`` opcode tuple -- confirm against
        the caller).
    """
    first_span = opcode[2] - opcode[1]
    second_span = opcode[4] - opcode[3]
    # Pick the wider of the two ranges; on a tie the first one is returned.
    if first_span < second_span:
        return second_span
    return first_span
|
||||
|
||||
def __removeTypos(self, diff):
|
||||
"""
|
||||
Remove typos from a diff set. A typo is a small difference (<maxTypoSize)
|
||||
surrounded by larger equal passages (>minFragmentSize).
|
||||
|
||||
``diff``
|
||||
The diff set to remove the typos from.
|
||||
"""
|
||||
#remove typo at beginning of string
|
||||
if len(diff) >= 2:
|
||||
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
|
||||
@ -96,6 +126,13 @@ class DuplicateSongFinder(object):
|
||||
return diff
|
||||
|
||||
def __lengthOfEqualBlocks(self, diff):
|
||||
"""
|
||||
Return the total length of all equal blocks in a diff set.
|
||||
Blocks smaller than minBlockSize are not counted.
|
||||
|
||||
``diff``
|
||||
The diff set to return the length for.
|
||||
"""
|
||||
length = 0
|
||||
for element in diff:
|
||||
if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
|
||||
@ -103,8 +140,15 @@ class DuplicateSongFinder(object):
|
||||
return length
|
||||
|
||||
def __lengthOfLongestEqualBlock(self, diff):
    """
    Return the length of the largest equal block in a diff set.

    Only elements whose first item is ``"equal"`` are taken into account.
    If the diff set contains no such element, 0 is returned.

    ``diff``
        The diff set to return the length for.
    """
    # Seed with 0 so that a diff set without any equal block yields 0.
    equal_lengths = [0]
    for opcode in diff:
        if opcode[0] != "equal":
            continue
        equal_lengths.append(self.__opLength(opcode))
    return max(equal_lengths)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user