forked from openlp/openlp
127 lines
5.6 KiB
Python
127 lines
5.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
##########################################################################
|
|
# OpenLP - Open Source Lyrics Projection #
|
|
# ---------------------------------------------------------------------- #
|
|
# Copyright (c) 2008-2022 OpenLP Developers #
|
|
# ---------------------------------------------------------------------- #
|
|
# This program is free software: you can redistribute it and/or modify #
|
|
# it under the terms of the GNU General Public License as published by #
|
|
# the Free Software Foundation, either version 3 of the License, or #
|
|
# (at your option) any later version. #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
# GNU General Public License for more details. #
|
|
# #
|
|
# You should have received a copy of the GNU General Public License #
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>. #
|
|
##########################################################################
|
|
"""
|
|
The :mod:`songcompare` module provides functionality to search for
|
|
duplicate songs. It has one single :func:`songs_probably_equal`.
|
|
|
|
| The algorithm is based on the diff algorithm.
|
|
| First a diffset is calculated for two songs.
|
|
| To compensate for typos all differences that are smaller than a
|
|
limit (<max_typo_size) and are surrounded by larger equal blocks
|
|
(>min_fragment_size) are removed and the surrounding equal parts are merged.
|
|
| Finally two conditions can qualify a song tuple to be a duplicate:
|
|
|
|
1. There is a block of equal content that is at least min_block_size large.
|
|
This condition should hit for all larger songs that have a long enough
|
|
equal part. Even if only one verse is equal this condition should still hit.
|
|
2. Two thirds of the smaller song is contained in the larger song.
|
|
This condition should hit if one of the two songs (or both) is small (smaller
|
|
than the min_block_size), but most of the song is contained in the other song.
|
|
"""
|
|
|
|
import difflib
|
|
|
|
|
|
MIN_FRAGMENT_SIZE = 5
|
|
MIN_BLOCK_SIZE = 70
|
|
MAX_TYPO_SIZE = 3
|
|
|
|
|
|
def songs_probably_equal(song_tuple):
|
|
"""
|
|
Calculate and return whether two songs are probably equal.
|
|
|
|
:param song_tuple: A tuple of two songs to compare.
|
|
"""
|
|
song1, song2 = song_tuple
|
|
pos1, lyrics1 = song1
|
|
pos2, lyrics2 = song2
|
|
if len(lyrics1) < len(lyrics2):
|
|
small = lyrics1
|
|
large = lyrics2
|
|
else:
|
|
small = lyrics2
|
|
large = lyrics1
|
|
differ = difflib.SequenceMatcher(a=large, b=small)
|
|
diff_tuples = differ.get_opcodes()
|
|
diff_no_typos = _remove_typos(diff_tuples)
|
|
# Check 1: Similarity based on the absolute length of equal parts.
|
|
# Calculate the total length of all equal blocks of the set.
|
|
# Blocks smaller than min_block_size are not counted.
|
|
length_of_equal_blocks = 0
|
|
for element in diff_no_typos:
|
|
if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE:
|
|
length_of_equal_blocks += _op_length(element)
|
|
|
|
if length_of_equal_blocks >= MIN_BLOCK_SIZE:
|
|
return pos1, pos2
|
|
# Check 2: Similarity based on the relative length of the longest equal block.
|
|
# Calculate the length of the largest equal block of the diff set.
|
|
length_of_longest_equal_block = 0
|
|
for element in diff_no_typos:
|
|
if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block:
|
|
length_of_longest_equal_block = _op_length(element)
|
|
if length_of_longest_equal_block > len(small) * 2 // 3:
|
|
return pos1, pos2
|
|
# Both checks failed. We assume the songs are not equal.
|
|
return None
|
|
|
|
|
|
def _op_length(opcode):
|
|
"""
|
|
Return the length of a given difference.
|
|
|
|
:param opcode: The difference.
|
|
"""
|
|
return max(opcode[2] - opcode[1], opcode[4] - opcode[3])
|
|
|
|
|
|
def _remove_typos(diff):
|
|
"""
|
|
Remove typos from a diff set. A typo is a small difference (<max_typo_size)
|
|
surrounded by larger equal passages (>min_fragment_size).
|
|
|
|
:param diff: The diff set to remove the typos from.
|
|
"""
|
|
# Remove typo at beginning of the string.
|
|
if len(diff) >= 2:
|
|
if diff[0][0] != "equal" and _op_length(diff[0]) <= MAX_TYPO_SIZE and _op_length(diff[1]) >= MIN_FRAGMENT_SIZE:
|
|
del diff[0]
|
|
# Remove typos in the middle of the string.
|
|
if len(diff) >= 3:
|
|
for index in range(len(diff) - 3, -1, -1):
|
|
if _op_length(diff[index]) >= MIN_FRAGMENT_SIZE and diff[index + 1][0] != "equal" and \
|
|
_op_length(diff[index + 1]) <= MAX_TYPO_SIZE and _op_length(diff[index + 2]) >= MIN_FRAGMENT_SIZE:
|
|
del diff[index + 1]
|
|
# Remove typo at the end of the string.
|
|
if len(diff) >= 2:
|
|
if _op_length(diff[-2]) >= MIN_FRAGMENT_SIZE and diff[-1][0] != "equal" \
|
|
and _op_length(diff[-1]) <= MAX_TYPO_SIZE:
|
|
del diff[-1]
|
|
|
|
# Merge the bordering equal passages that occured by removing differences.
|
|
for index in range(len(diff) - 2, -1, -1):
|
|
if diff[index][0] == "equal" and _op_length(diff[index]) >= MIN_FRAGMENT_SIZE and \
|
|
diff[index + 1][0] == "equal" and _op_length(diff[index + 1]) >= MIN_FRAGMENT_SIZE:
|
|
diff[index] = ("equal", diff[index][1], diff[index + 1][2], diff[index][3], diff[index + 1][4])
|
|
del diff[index + 1]
|
|
return diff
|