# -*- coding: utf-8 -*- ########################################################################## # OpenLP - Open Source Lyrics Projection # # ---------------------------------------------------------------------- # # Copyright (c) 2008-2022 OpenLP Developers # # ---------------------------------------------------------------------- # # This program is free software: you can redistribute it and/or modify # # it under the terms of the GNU General Public License as published by # # the Free Software Foundation, either version 3 of the License, or # # (at your option) any later version. # # # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU General Public License for more details. # # # # You should have received a copy of the GNU General Public License # # along with this program. If not, see . # ########################################################################## """ The :mod:`songcompare` module provides functionality to search for duplicate songs. It has one single :func:`songs_probably_equal`. | The algorithm is based on the diff algorithm. | First a diffset is calculated for two songs. | To compensate for typos all differences that are smaller than a limit (min_fragment_size) are removed and the surrounding equal parts are merged. | Finally two conditions can qualify a song tuple to be a duplicate: 1. There is a block of equal content that is at least min_block_size large. This condition should hit for all larger songs that have a long enough equal part. Even if only one verse is equal this condition should still hit. 2. Two thirds of the smaller song is contained in the larger song. This condition should hit if one of the two songs (or both) is small (smaller than the min_block_size), but most of the song is contained in the other song. """ import difflib MIN_FRAGMENT_SIZE = 5 MIN_BLOCK_SIZE = 70 MAX_TYPO_SIZE = 3 def songs_probably_equal(song_tuple): """ Calculate and return whether two songs are probably equal. :param song_tuple: A tuple of two songs to compare. """ song1, song2 = song_tuple pos1, lyrics1 = song1 pos2, lyrics2 = song2 if len(lyrics1) < len(lyrics2): small = lyrics1 large = lyrics2 else: small = lyrics2 large = lyrics1 differ = difflib.SequenceMatcher(a=large, b=small) diff_tuples = differ.get_opcodes() diff_no_typos = _remove_typos(diff_tuples) # Check 1: Similarity based on the absolute length of equal parts. # Calculate the total length of all equal blocks of the set. # Blocks smaller than min_block_size are not counted. length_of_equal_blocks = 0 for element in diff_no_typos: if element[0] == "equal" and _op_length(element) >= MIN_BLOCK_SIZE: length_of_equal_blocks += _op_length(element) if length_of_equal_blocks >= MIN_BLOCK_SIZE: return pos1, pos2 # Check 2: Similarity based on the relative length of the longest equal block. # Calculate the length of the largest equal block of the diff set. length_of_longest_equal_block = 0 for element in diff_no_typos: if element[0] == "equal" and _op_length(element) > length_of_longest_equal_block: length_of_longest_equal_block = _op_length(element) if length_of_longest_equal_block > len(small) * 2 // 3: return pos1, pos2 # Both checks failed. We assume the songs are not equal. return None def _op_length(opcode): """ Return the length of a given difference. :param opcode: The difference. """ return max(opcode[2] - opcode[1], opcode[4] - opcode[3]) def _remove_typos(diff): """ Remove typos from a diff set. A typo is a small difference (min_fragment_size). :param diff: The diff set to remove the typos from. """ # Remove typo at beginning of the string. if len(diff) >= 2: if diff[0][0] != "equal" and _op_length(diff[0]) <= MAX_TYPO_SIZE and _op_length(diff[1]) >= MIN_FRAGMENT_SIZE: del diff[0] # Remove typos in the middle of the string. if len(diff) >= 3: for index in range(len(diff) - 3, -1, -1): if _op_length(diff[index]) >= MIN_FRAGMENT_SIZE and diff[index + 1][0] != "equal" and \ _op_length(diff[index + 1]) <= MAX_TYPO_SIZE and _op_length(diff[index + 2]) >= MIN_FRAGMENT_SIZE: del diff[index + 1] # Remove typo at the end of the string. if len(diff) >= 2: if _op_length(diff[-2]) >= MIN_FRAGMENT_SIZE and diff[-1][0] != "equal" \ and _op_length(diff[-1]) <= MAX_TYPO_SIZE: del diff[-1] # Merge the bordering equal passages that occured by removing differences. for index in range(len(diff) - 2, -1, -1): if diff[index][0] == "equal" and _op_length(diff[index]) >= MIN_FRAGMENT_SIZE and \ diff[index + 1][0] == "equal" and _op_length(diff[index + 1]) >= MIN_FRAGMENT_SIZE: diff[index] = ("equal", diff[index][1], diff[index + 1][2], diff[index][3], diff[index + 1][4]) del diff[index + 1] return diff