forked from openlp/openlp
Added basic duplicate song detection, no removal or fancy GUI yet.
This commit is contained in:
parent
603249b2a3
commit
2dfb7bec9c
114
openlp/plugins/songs/lib/doublesfinder.py
Normal file
114
openlp/plugins/songs/lib/doublesfinder.py
Normal file
@ -0,0 +1,114 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
|
||||
|
||||
###############################################################################
|
||||
# OpenLP - Open Source Lyrics Projection #
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Copyright (c) 2008-2013 Raoul Snyman #
|
||||
# Portions copyright (c) 2008-2013 Tim Bentley, Gerald Britton, Jonathan #
|
||||
# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, #
|
||||
# Meinert Jordan, Armin Köhler, Erik Lundin, Edwin Lunando, Brian T. Meyer. #
|
||||
# Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias Põldaru, #
|
||||
# Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, #
|
||||
# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Dave Warnock, #
|
||||
# Frode Woldsund, Martin Zibricky, Patrick Zimmermann #
|
||||
# --------------------------------------------------------------------------- #
|
||||
# This program is free software; you can redistribute it and/or modify it #
|
||||
# under the terms of the GNU General Public License as published by the Free #
|
||||
# Software Foundation; version 2 of the License. #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT #
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
|
||||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
|
||||
# more details. #
|
||||
# #
|
||||
# You should have received a copy of the GNU General Public License along #
|
||||
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
|
||||
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
||||
###############################################################################
|
||||
"""
|
||||
The :mod:`doublesfinder` module provides the functionality for detecting
|
||||
probable duplicate songs in the OpenLP song database.
|
||||
"""
|
||||
import logging
|
||||
import difflib
|
||||
|
||||
from openlp.core.lib import translate
|
||||
from openlp.plugins.songs.lib.db import Song
|
||||
from openlp.plugins.songs.lib.ui import SongStrings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
class DuplicateSongFinder(object):
    """
    The :class:`DuplicateSongFinder` class provides functionality to decide
    whether two songs are probably duplicates of each other.

    The ``search_lyrics`` of both songs are diffed with :mod:`difflib`. Short
    differences ("typos") next to sufficiently long fragments are dropped,
    equal fragments that end up adjacent are merged, and the songs count as
    probably equal when the remaining equal blocks are long enough.
    """

    def __init__(self):
        # Minimum length a fragment next to a difference must have before the
        # difference may be discarded as a typo (and before equal fragments
        # are merged).
        self.minFragmentSize = 5
        # Minimum length an equal block must have to count towards the total
        # length of equal text.
        self.minBlockSize = 70
        # Maximum length of a difference that is still treated as a typo.
        self.maxTypoSize = 3

    def songsProbablyEqual(self, song1, song2):
        """
        Compare the lyrics of two songs and report whether they are probably
        the same song.

        ``song1``
            First song; only its ``search_lyrics`` attribute is read.

        ``song2``
            Second song; only its ``search_lyrics`` attribute is read.

        Returns ``True`` when the equal blocks of at least ``minBlockSize``
        characters add up to ``minBlockSize`` or more, or when the longest
        equal block covers more than two thirds of the shorter lyrics.
        Otherwise returns ``False``.
        """
        if len(song1.search_lyrics) < len(song2.search_lyrics):
            small = song1.search_lyrics
            large = song2.search_lyrics
        else:
            small = song2.search_lyrics
            large = song1.search_lyrics
        # The automatic junk heuristic must be disabled: for sequences of 200
        # or more items it stops frequently occurring items from anchoring
        # matches, and in lyrics nearly every character is frequent, so real
        # duplicates would go undetected. (Requires Python 2.7.1 or newer.)
        differ = difflib.SequenceMatcher(None, small, large, autojunk=False)
        diff_no_typos = self.__removeTypos(differ.get_opcodes())
        if self.__lengthOfEqualBlocks(diff_no_typos) >= self.minBlockSize:
            return True
        # Integer form of "longest > len(small) * 2 / 3", independent of the
        # interpreter's division semantics.
        return 3 * self.__lengthOfLongestEqualBlock(diff_no_typos) > 2 * len(small)

    def __opLength(self, opcode):
        """
        Return the length of a difflib opcode ``(tag, i1, i2, j1, j2)``: the
        larger of the two spans it covers.
        """
        return max(opcode[2] - opcode[1], opcode[4] - opcode[3])

    def __removeTypos(self, diff):
        """
        Return a cleaned-up copy of the opcode list ``diff``.

        Non-equal opcodes of at most ``maxTypoSize`` characters are removed
        when their neighbouring opcodes are at least ``minFragmentSize`` long.
        Afterwards adjacent equal opcodes of at least ``minFragmentSize``
        characters are merged into one.
        """
        # Work on a copy so the caller's list (difflib caches the opcode list
        # it hands out) is left untouched.
        diff = list(diff)
        # Remove a typo at the beginning of the string.
        if len(diff) >= 2:
            if diff[0][0] != "equal" and self.__opLength(diff[0]) <= self.maxTypoSize and \
                    self.__opLength(diff[1]) >= self.minFragmentSize:
                del diff[0]
        # Remove typos in the middle of the string. Walking backwards keeps
        # the indexes still to be visited valid while entries are deleted.
        if len(diff) >= 3:
            for index in range(len(diff) - 3, -1, -1):
                if self.__opLength(diff[index]) >= self.minFragmentSize and \
                        diff[index + 1][0] != "equal" and \
                        self.__opLength(diff[index + 1]) <= self.maxTypoSize and \
                        self.__opLength(diff[index + 2]) >= self.minFragmentSize:
                    del diff[index + 1]
        # Remove a typo at the end of the string.
        if len(diff) >= 2:
            if self.__opLength(diff[-2]) >= self.minFragmentSize and \
                    diff[-1][0] != "equal" and self.__opLength(diff[-1]) <= self.maxTypoSize:
                del diff[-1]
        # Merge equal fragments that are adjacent now. The merged opcode spans
        # from the start of the first to the end of the second fragment.
        for index in range(len(diff) - 2, -1, -1):
            if diff[index][0] == "equal" and self.__opLength(diff[index]) >= self.minFragmentSize and \
                    diff[index + 1][0] == "equal" and self.__opLength(diff[index + 1]) >= self.minFragmentSize:
                diff[index] = ("equal", diff[index][1], diff[index + 1][2], diff[index][3],
                    diff[index + 1][4])
                del diff[index + 1]
        return diff

    def __lengthOfEqualBlocks(self, diff):
        """
        Return the summed length of all equal opcodes in ``diff`` that are at
        least ``minBlockSize`` long.
        """
        length = 0
        for element in diff:
            if element[0] == "equal" and self.__opLength(element) >= self.minBlockSize:
                length += self.__opLength(element)
        return length

    def __lengthOfLongestEqualBlock(self, diff):
        """
        Return the length of the longest equal opcode in ``diff``, or ``0``
        when there is none.
        """
        length = 0
        for element in diff:
            if element[0] == "equal" and self.__opLength(element) > length:
                length = self.__opLength(element)
        return length
|
@ -45,6 +45,7 @@ from openlp.plugins.songs.lib import clean_song, upgrade, SongMediaItem, \
|
||||
from openlp.plugins.songs.lib.db import init_schema, Song
|
||||
from openlp.plugins.songs.lib.importer import SongFormat
|
||||
from openlp.plugins.songs.lib.olpimport import OpenLPSongImport
|
||||
from openlp.plugins.songs.lib.doublesfinder import DuplicateSongFinder
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@ -77,10 +78,12 @@ class SongsPlugin(Plugin):
|
||||
self.songImportItem.setVisible(True)
|
||||
self.songExportItem.setVisible(True)
|
||||
self.toolsReindexItem.setVisible(True)
|
||||
self.toolsFindDuplicates.setVisible(True)
|
||||
action_list = ActionList.get_instance()
|
||||
action_list.add_action(self.songImportItem, UiStrings().Import)
|
||||
action_list.add_action(self.songExportItem, UiStrings().Export)
|
||||
action_list.add_action(self.toolsReindexItem, UiStrings().Tools)
|
||||
action_list.add_action(self.toolsFindDuplicates, UiStrings().Tools)
|
||||
QtCore.QObject.connect(Receiver.get_receiver(),
|
||||
QtCore.SIGNAL(u'servicemanager_new_service'),
|
||||
self.clearTemporarySongs)
|
||||
@ -122,7 +125,7 @@ class SongsPlugin(Plugin):
|
||||
|
||||
def addToolsMenuItem(self, tools_menu):
|
||||
"""
|
||||
Give the alerts plugin the opportunity to add items to the
|
||||
Give the Songs plugin the opportunity to add items to the
|
||||
**Tools** menu.
|
||||
|
||||
``tools_menu``
|
||||
@ -137,6 +140,12 @@ class SongsPlugin(Plugin):
|
||||
'Re-index the songs database to improve searching and ordering.'),
|
||||
visible=False, triggers=self.onToolsReindexItemTriggered)
|
||||
tools_menu.addAction(self.toolsReindexItem)
|
||||
self.toolsFindDuplicates = create_action(tools_menu, u'toolsFindDuplicates',
|
||||
text=translate('SongsPlugin', 'Find &duplicate songs'),
|
||||
statustip=translate('SongsPlugin',
|
||||
'Find and remove duplicate songs in the song database.'),
|
||||
visible=False, triggers=self.onToolsFindDuplicatesTriggered)
|
||||
tools_menu.addAction(self.toolsFindDuplicates)
|
||||
|
||||
def onToolsReindexItemTriggered(self):
|
||||
"""
|
||||
@ -157,6 +166,25 @@ class SongsPlugin(Plugin):
|
||||
self.manager.save_objects(songs)
|
||||
self.mediaItem.onSearchTextButtonClicked()
|
||||
|
||||
def onToolsFindDuplicatesTriggered(self):
    """
    Search for duplicates in the song database.

    Every pair of songs is compared exactly once. Each pair judged to be a
    probable duplicate is reported in a message box that shows the positions
    of the two songs in the loaded song list.
    """
    maxSongs = self.manager.get_object_count(Song)
    if maxSongs == 0:
        return
    QtGui.QMessageBox.information(self.formParent,
        "Find duplicates called", "Called...")
    songs = self.manager.get_all_objects(Song)
    # Bound the loops by the list that is actually indexed, so a count that
    # disagrees with the loaded list cannot cause an IndexError.
    # NOTE(review): assumes get_all_objects() returns a sized sequence - confirm.
    songCount = len(songs)
    # The finder does not depend on the pair being compared, so one instance
    # serves all comparisons.
    doubleFinder = DuplicateSongFinder()
    for outerSongCounter in range(songCount - 1):
        for innerSongCounter in range(outerSongCounter + 1, songCount):
            if doubleFinder.songsProbablyEqual(songs[outerSongCounter],
                    songs[innerSongCounter]):
                QtGui.QMessageBox.information(self.formParent,
                    "Double found", str(innerSongCounter) + " " +
                    str(outerSongCounter))
|
||||
|
||||
def onSongImportItemClicked(self):
|
||||
if self.mediaItem:
|
||||
self.mediaItem.onImportClick()
|
||||
@ -280,10 +308,12 @@ class SongsPlugin(Plugin):
|
||||
self.songImportItem.setVisible(False)
|
||||
self.songExportItem.setVisible(False)
|
||||
self.toolsReindexItem.setVisible(False)
|
||||
self.toolsFindDuplicates.setVisible(False)
|
||||
action_list = ActionList.get_instance()
|
||||
action_list.remove_action(self.songImportItem, UiStrings().Import)
|
||||
action_list.remove_action(self.songExportItem, UiStrings().Export)
|
||||
action_list.remove_action(self.toolsReindexItem, UiStrings().Tools)
|
||||
action_list.remove_action(self.toolsFindDuplicates, UiStrings().Tools)
|
||||
Plugin.finalise(self)
|
||||
|
||||
def clearTemporarySongs(self):
|
||||
|
Loading…
Reference in New Issue
Block a user