From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by finch.gentoo.org (Postfix) with ESMTPS id 70EB413832E for ; Mon, 25 Jul 2016 02:55:05 +0000 (UTC) Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id B63A221C067; Mon, 25 Jul 2016 02:55:01 +0000 (UTC) Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by pigeon.gentoo.org (Postfix) with ESMTPS id 1914821C065 for ; Mon, 25 Jul 2016 02:55:00 +0000 (UTC) Received: from localhost.localdomain (ip68-5-185-102.oc.oc.cox.net [68.5.185.102]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-SHA256 (128/128 bits)) (No client certificate requested) (Authenticated sender: zmedico) by smtp.gentoo.org (Postfix) with ESMTPSA id 72316340CD6; Mon, 25 Jul 2016 02:54:59 +0000 (UTC) From: Zac Medico To: gentoo-portage-dev@lists.gentoo.org Cc: Zac Medico Subject: [gentoo-portage-dev] [PATCH] emerge: add --fuzzy-search and --search-similarity (bug 65566) Date: Sun, 24 Jul 2016 19:54:40 -0700 Message-Id: <1469415280-4900-1-git-send-email-zmedico@gentoo.org> X-Mailer: git-send-email 2.7.4 Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: gentoo-portage-dev@lists.gentoo.org Reply-to: gentoo-portage-dev@lists.gentoo.org X-Archives-Salt: 3358c8cd-36ed-453e-88de-e6118db88c6b X-Archives-Hash: 43aba739d5f9b629b98f23bd618a6310 Add --fuzzy-search option, and --search-similarity option to adjust the minimum similarity for search results (defaults to 80%). X-Gentoo-bug: 65566 X-Gentoo-bug-url: https://bugs.gentoo.org/show_bug.cgi?id=65566 --- man/emerge.1 | 14 ++++++++++++++ pym/_emerge/actions.py | 7 +++++-- pym/_emerge/main.py | 32 +++++++++++++++++++++++++++++++- pym/_emerge/search.py | 26 ++++++++++++++++++++++++-- 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/man/emerge.1 b/man/emerge.1 index da1d852..7442220 100644 --- a/man/emerge.1 +++ b/man/emerge.1 @@ -565,6 +565,14 @@ packages (fetch things from SRC_URI based upon USE setting). Instead of doing any package building, just perform fetches for all packages (fetch everything in SRC_URI regardless of USE setting). .TP +.BR "\-\-fuzzy\-search [ y | n ]" +Enable or disable fuzzy search for search actions. When fuzzy search +is enabled, a result is returned if it is sufficiently similar to the +search string, without requiring an exact match. This option is enabled +by default. Fuzzy search does not support regular expressions, therefore +it is automatically disabled for regular expression searches. Fuzzy +search is slightly slower than non\-fuzzy search. +.TP .BR "\-\-getbinpkg [ y | n ] (\-g short option)" Using the server and location defined in \fIPORTAGE_BINHOST\fR (see \fBmake.conf\fR(5)), portage will download the information from each binary @@ -874,6 +882,12 @@ enabled by default. The search index needs to be regenerated by to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later overridden via the command line. .TP +.BR "\-\-search\-similarity PERCENTAGE" +Set the minimum similarity percentage (a floating-point number between +0 and 100). Search results with similarity percentages lower than this +are discarded (default: \'80\'). This option has no effect unless the +\fB\-\-fuzzy\-search\fR option is enabled. +.TP .BR "\-\-select [ y | n ] (\-w short option)" Add specified packages to the world set (inverse of \fB\-\-oneshot\fR). This is useful if you want to diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py index 1dc2b0d..6704afc 100644 --- a/pym/_emerge/actions.py +++ b/pym/_emerge/actions.py @@ -1,4 +1,4 @@ -# Copyright 1999-2015 Gentoo Foundation +# Copyright 1999-2016 Gentoo Foundation # Distributed under the terms of the GNU General Public License v2 from __future__ import division, print_function, unicode_literals @@ -1974,7 +1974,10 @@ def action_search(root_config, myopts, myfiles, spinner): spinner, "--searchdesc" in myopts, "--quiet" not in myopts, "--usepkg" in myopts, "--usepkgonly" in myopts, - search_index = myopts.get("--search-index", "y") != "n") + search_index=myopts.get("--search-index", "y") != "n", + search_similarity=myopts.get("--search-similarity"), + fuzzy=myopts.get("--fuzzy-search") != "n", + ) for mysearch in myfiles: try: searchinstance.execute(mysearch) diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py index 0e672a2..eae1954 100644 --- a/pym/_emerge/main.py +++ b/pym/_emerge/main.py @@ -1,4 +1,4 @@ -# Copyright 1999-2015 Gentoo Foundation +# Copyright 1999-2016 Gentoo Foundation # Distributed under the terms of the GNU General Public License v2 from __future__ import print_function @@ -141,6 +141,7 @@ def insert_optional_args(args): '--deselect' : y_or_n, '--binpkg-respect-use' : y_or_n, '--fail-clean' : y_or_n, + '--fuzzy-search' : y_or_n, '--getbinpkg' : y_or_n, '--getbinpkgonly' : y_or_n, '--jobs' : valid_integers, @@ -458,6 +459,11 @@ def parse_opts(tmpcmdline, silent=False): "choices" : true_y_or_n }, + "--fuzzy-search": { + "help": "Enable or disable fuzzy search", + "choices": true_y_or_n + }, + "--ignore-built-slot-operator-deps": { "help": "Ignore the slot/sub-slot := operator parts of dependencies that have " "been recorded when packages where built. This option is intended " @@ -658,6 +664,12 @@ def parse_opts(tmpcmdline, silent=False): "choices": y_or_n }, + "--search-similarity": { + "help": ("Set minimum similarity percentage for fuzzy seach " + "(a floating-point number between 0 and 100)"), + "action": "store" + }, + "--select": { "shortopt" : "-w", "help" : "add specified packages to the world set " + \ @@ -855,6 +867,9 @@ def parse_opts(tmpcmdline, silent=False): if myoptions.fail_clean in true_y: myoptions.fail_clean = True + if myoptions.fuzzy_search in true_y: + myoptions.fuzzy_search = True + if myoptions.getbinpkg in true_y: myoptions.getbinpkg = True else: @@ -1009,6 +1024,21 @@ def parse_opts(tmpcmdline, silent=False): myoptions.rebuilt_binaries_timestamp = rebuilt_binaries_timestamp + if myoptions.search_similarity: + try: + search_similarity = float(myoptions.search_similarity) + except ValueError: + parser.error("Invalid --search-similarity parameter " + "(not a number): '{}'\n".format( + myoptions.search_similarity)) + + if search_similarity < 0 or search_similarity > 100: + parser.error("Invalid --search-similarity parameter " + "(not between 0 and 100): '{}'\n".format( + myoptions.search_similarity)) + + myoptions.search_similarity = search_similarity + if myoptions.use_ebuild_visibility in true_y: myoptions.use_ebuild_visibility = True else: diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py index 32d326e..20a0c02 100644 --- a/pym/_emerge/search.py +++ b/pym/_emerge/search.py @@ -1,8 +1,9 @@ -# Copyright 1999-2015 Gentoo Foundation +# Copyright 1999-2016 Gentoo Foundation # Distributed under the terms of the GNU General Public License v2 from __future__ import unicode_literals +import difflib import re import portage from portage import os @@ -28,7 +29,8 @@ class search(object): # public interface # def __init__(self, root_config, spinner, searchdesc, - verbose, usepkg, usepkgonly, search_index=True): + verbose, usepkg, usepkgonly, search_index=True, + search_similarity=None, fuzzy=True): """Searches the available and installed packages for the supplied search key. The list of available and installed packages is created at object instantiation. This makes successive searches faster.""" @@ -42,6 +44,9 @@ class search(object): self.spinner = None self.root_config = root_config self.setconfig = root_config.setconfig + self.fuzzy = fuzzy + self.search_similarity = (80 if search_similarity is None + else search_similarity) self.matches = {"pkg" : []} self.mlen = 0 @@ -248,11 +253,26 @@ class search(object): if self.searchkey.startswith('@'): match_category = 1 self.searchkey = self.searchkey[1:] + fuzzy = False if regexsearch: self.searchre=re.compile(self.searchkey,re.I) else: self.searchre=re.compile(re.escape(self.searchkey), re.I) + # Fuzzy search does not support regular expressions, therefore + # it is disabled for regular expression searches. + if self.fuzzy: + fuzzy = True + cutoff = float(self.search_similarity) / 100 + seq_match = difflib.SequenceMatcher() + seq_match.set_seq2(self.searchkey.lower()) + + def fuzzy_search(match_string): + seq_match.set_seq1(match_string.lower()) + return (seq_match.real_quick_ratio() >= cutoff and + seq_match.quick_ratio() >= cutoff and + seq_match.ratio() >= cutoff) + for package in self._cp_all(): self._spinner_update() @@ -263,6 +283,8 @@ class search(object): if self.searchre.search(match_string): yield ("pkg", package) + elif fuzzy and fuzzy_search(match_string): + yield ("pkg", package) elif self.searchdesc: # DESCRIPTION searching # Use _first_cp to avoid an expensive visibility check, # since the visibility check can be avoided entirely -- 2.7.4