forked from openlp/openlp
Lots more comments.
This commit is contained in:
parent
bbe9293392
commit
bc3f854921
@ -26,19 +26,27 @@
|
|||||||
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
|
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
|
||||||
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
||||||
###############################################################################
|
###############################################################################
|
||||||
import logging
|
|
||||||
import difflib
|
import difflib
|
||||||
|
|
||||||
from openlp.core.lib import translate
|
|
||||||
from openlp.plugins.songs.lib.db import Song
|
from openlp.plugins.songs.lib.db import Song
|
||||||
from openlp.plugins.songs.lib.ui import SongStrings
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
class DuplicateSongFinder(object):
|
class DuplicateSongFinder(object):
|
||||||
"""
|
"""
|
||||||
The :class:`DuplicateSongFinder` class provides functionality to search for
|
The :class:`DuplicateSongFinder` class provides functionality to search for
|
||||||
and remove duplicate songs.
|
duplicate songs.
|
||||||
|
|
||||||
|
The algorithm is based on the diff algorithm.
|
||||||
|
First a diffset is calculated for two songs.
|
||||||
|
To compensate for typos all differences that are smaller than a
|
||||||
|
limit (<maxTypoSize) and are surrounded by larger equal blocks
|
||||||
|
(>minFragmentSize) are removed and the surrounding equal parts are merged.
|
||||||
|
Finally two conditions can qualify a song tuple to be a duplicate:
|
||||||
|
1. There is a block of equal content that is at least minBlockSize large.
|
||||||
|
This condition should hit for all larger songs that have a long enough
|
||||||
|
equal part. Even if only one verse is equal this condition should still hit.
|
||||||
|
2. Two thirds of the smaller song is contained in the larger song.
|
||||||
|
This condition should hit if one of the two songs (or both) is small (smaller
|
||||||
|
than the minBlockSize), but most of the song is contained in the other song.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -47,6 +55,15 @@ class DuplicateSongFinder(object):
|
|||||||
self.maxTypoSize = 3
|
self.maxTypoSize = 3
|
||||||
|
|
||||||
def songsProbablyEqual(self, song1, song2):
|
def songsProbablyEqual(self, song1, song2):
|
||||||
|
"""
|
||||||
|
Calculate and return whether two songs are probably equal.
|
||||||
|
|
||||||
|
``song1``
|
||||||
|
The first song to compare.
|
||||||
|
|
||||||
|
``song2``
|
||||||
|
The second song to compare.
|
||||||
|
"""
|
||||||
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
if len(song1.search_lyrics) < len(song2.search_lyrics):
|
||||||
small = song1.search_lyrics
|
small = song1.search_lyrics
|
||||||
large = song2.search_lyrics
|
large = song2.search_lyrics
|
||||||
@ -64,9 +81,22 @@ class DuplicateSongFinder(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def __opLength(self, opcode):
|
def __opLength(self, opcode):
|
||||||
|
"""
|
||||||
|
Return the length of a given difference.
|
||||||
|
|
||||||
|
``opcode``
|
||||||
|
The difference.
|
||||||
|
"""
|
||||||
return max(opcode[2]-opcode[1], opcode[4] - opcode[3])
|
return max(opcode[2]-opcode[1], opcode[4] - opcode[3])
|
||||||
|
|
||||||
def __removeTypos(self, diff):
|
def __removeTypos(self, diff):
|
||||||
|
"""
|
||||||
|
Remove typos from a diff set. A typo is a small difference (<maxTypoSize)
|
||||||
|
surrounded by larger equal passages (>minFragmentSize).
|
||||||
|
|
||||||
|
``diff``
|
||||||
|
The diff set to remove the typos from.
|
||||||
|
"""
|
||||||
#remove typo at beginning of string
|
#remove typo at beginning of string
|
||||||
if len(diff) >= 2:
|
if len(diff) >= 2:
|
||||||
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
|
if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
|
||||||
@ -96,6 +126,13 @@ class DuplicateSongFinder(object):
|
|||||||
return diff
|
return diff
|
||||||
|
|
||||||
def __lengthOfEqualBlocks(self, diff):
|
def __lengthOfEqualBlocks(self, diff):
|
||||||
|
"""
|
||||||
|
Return the total length of all equal blocks in a diff set.
|
||||||
|
Blocks smaller than minBlockSize are not counted.
|
||||||
|
|
||||||
|
``diff``
|
||||||
|
The diff set to return the length for.
|
||||||
|
"""
|
||||||
length = 0
|
length = 0
|
||||||
for element in diff:
|
for element in diff:
|
||||||
if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
|
if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
|
||||||
@ -103,8 +140,15 @@ class DuplicateSongFinder(object):
|
|||||||
return length
|
return length
|
||||||
|
|
||||||
def __lengthOfLongestEqualBlock(self, diff):
|
def __lengthOfLongestEqualBlock(self, diff):
|
||||||
|
"""
|
||||||
|
Return the length of the largest equal block in a diff set.
|
||||||
|
|
||||||
|
``diff``
|
||||||
|
The diff set to return the length for.
|
||||||
|
"""
|
||||||
length = 0
|
length = 0
|
||||||
for element in diff:
|
for element in diff:
|
||||||
if element[0] == "equal" and self.__opLength(element) > length:
|
if element[0] == "equal" and self.__opLength(element) > length:
|
||||||
length = self.__opLength(element)
|
length = self.__opLength(element)
|
||||||
return length
|
return length
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user