* [gentoo-portage-dev] [PATCH] emerge --search: use description index
@ 2014-10-18 3:28 Zac Medico
2014-10-18 5:59 ` [gentoo-portage-dev] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
0 siblings, 2 replies; 29+ messages in thread
From: Zac Medico @ 2014-10-18 3:28 UTC
To: gentoo-portage-dev
This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions. If the package description index is missing from a particular
repository, then all metadata for that repository is obtained using the
normal portdbapi.aux_get method.
Searching of installed packages is optimized to take advantage of
vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
See the IndexedVardb docstring for some more details.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
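For reference (not part of the patch itself), the index written by the
GenPkgDescIndex class below has one line per package: the lowest
version's full cpv, then any remaining versions comma-separated, then
the description taken from the highest version. A hypothetical excerpt
would look like:

  sys-apps/usleep-0.1: A wrapper for usleep
  sys-apps/sed-4.2,4.2.1,4.2.1-r1,4.2.2: Super-useful stream editor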
---
bin/egencache | 43 ++++++++++-
man/egencache.1 | 4 ++
man/portage.5 | 6 ++
pym/_emerge/search.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++----
4 files changed, 232 insertions(+), 17 deletions(-)
diff --git a/bin/egencache b/bin/egencache
index e366058..90d5e68 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._eventloop.global_event_loop import global_event_loop
from portage import cpv_getkey
from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
try:
from xml.etree import ElementTree
@@ -91,6 +91,9 @@ def parse_args(args):
actions.add_argument("--update-changelogs",
action="store_true",
help="update the ChangeLog files from SCM logs")
+ actions.add_argument("--update-pkg-desc-index",
+ action="store_true",
+ help="update package description index")
actions.add_argument("--update-manifests",
action="store_true",
help="update manifests")
@@ -451,6 +454,35 @@ class GenCache(object):
if hasattr(trg_cache, '_prune_empty_dirs'):
trg_cache._prune_empty_dirs()
+class GenPkgDescIndex(object):
+ def __init__(self, portdb, output_file):
+ self.returncode = os.EX_OK
+ self._portdb = portdb
+ self._output_file = output_file
+
+ def run(self):
+
+ portage.util.ensure_dirs(os.path.dirname(self._output_file))
+ f = portage.util.atomic_ofstream(self._output_file,
+ encoding=_encodings["repo.content"])
+
+ portdb = self._portdb
+ for cp in portdb.cp_all():
+ pkgs = portdb.cp_list(cp)
+ if not pkgs:
+ continue
+ desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+ if len(pkgs) == 1:
+ output = "%s: %s\n" % (pkgs[0], desc)
+ else:
+ output = "%s,%s: %s\n" % (pkgs[0],
+ ",".join(_pkg_str(cpv).version
+ for cpv in pkgs[1:]), desc)
+ f.write(output)
+
+ f.close()
+
class GenUseLocalDesc(object):
def __init__(self, portdb, output=None,
preserve_comments=False):
@@ -893,7 +925,8 @@ def egencache_main(args):
local_config=False, env=env)
if not (options.update or options.update_use_local_desc or
- options.update_changelogs or options.update_manifests):
+ options.update_changelogs or options.update_manifests or
+ options.update_pkg_desc_index):
parser.error('No action specified')
return 1
@@ -1057,6 +1090,12 @@ def egencache_main(args):
else:
ret.append(scheduler.returncode)
+ if options.update_pkg_desc_index:
+ gen_index = GenPkgDescIndex(portdb, os.path.join(
+ repo_config.location, "metadata", "pkg_desc_index"))
+ gen_index.run()
+ ret.append(gen_index.returncode)
+
if options.update_use_local_desc:
gen_desc = GenUseLocalDesc(portdb,
output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
.BR "\-\-update\-changelogs"
Update the ChangeLog files from SCM logs (supported only in git repos).
.TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
.BR "\-\-update\-use\-local\-desc"
Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
.TP
diff --git a/man/portage.5 b/man/portage.5
index e399f0f..26856d1 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -75,6 +75,7 @@ user\-defined package sets
.BR /usr/portage/metadata/
.nf
layout.conf
+pkg_desc_index
.fi
.TP
.BR /usr/portage/profiles/
@@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms
profile\-formats = portage-2
.fi
.RE
+.TP
+.BR pkg_desc_index
+This is an index of packages and descriptions which may be generated
+by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search
+actions.
.RE
.TP
.BR /usr/portage/profiles/
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..bf15f11 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -3,13 +3,17 @@
from __future__ import print_function
+import io
import re
import portage
-from portage import os
+from portage import os, _encodings
from portage.dbapi.porttree import _parse_uri_map
+from portage.dep import Atom
+from portage.exception import InvalidData
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.versions import _pkg_str
from _emerge.Package import Package
@@ -30,7 +34,6 @@ class search(object):
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
@@ -41,9 +44,9 @@ class search(object):
self._dbs = []
- portdb = root_config.trees["porttree"].dbapi
+ portdb = IndexedPortdb(root_config.trees["porttree"].dbapi)
bindb = root_config.trees["bintree"].dbapi
- vardb = root_config.trees["vartree"].dbapi
+ vardb = IndexedVardb(root_config.trees["vartree"].dbapi)
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,6 +56,7 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
@@ -97,7 +101,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -208,6 +212,20 @@ class search(object):
masked=1
self.matches["pkg"].append([package,masked])
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = portage.best(
+ self._xmatch("match-all", package))
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +235,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ self.matches["desc"].append((full_package, masked))
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -262,7 +274,7 @@ class search(object):
bold(self.searchkey) + " ]\n")
msg.append("[ Applications found : " + \
bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
@@ -372,7 +384,11 @@ class search(object):
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -392,3 +408,153 @@ class search(object):
result = ""
return result
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+ """
+ def __init__(self, portdb):
+ self._portdb = portdb
+ self.cpv_exists = portdb.cpv_exists
+ self.getFetchMap = portdb.getFetchMap
+ self.findname = portdb.findname
+ self._aux_cache_keys = portdb._aux_cache_keys
+ self._have_root_eclass_dir = portdb._have_root_eclass_dir
+ self._cpv_sort_ascending = portdb._cpv_sort_ascending
+ self._desc_cache = None
+ self._cp_map = None
+
+ def _init_index(self):
+ cp_map = {}
+ desc_cache = {}
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ for parent_dir in (repo_path, outside_repo):
+ file_path = os.path.join(parent_dir,
+ "metadata", "pkg_desc_index")
+
+ try:
+ with io.open(file_path,
+ encoding=_encodings["repo.content"]) as f:
+ for line in f:
+ pkgs, desc = line.split(":", 1)
+ desc = desc.strip()
+ pkgs = pkgs.split(",")
+ if not pkgs[0]:
+ continue
+ try:
+ pkg = _pkg_str(pkgs[0])
+ except InvalidData:
+ continue
+ cp_list = cp_map.get(pkg.cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[pkg.cp] = cp_list
+ cp_list.append(pkg)
+ for ver in pkgs[1:]:
+ try:
+ cp_list.append(
+ _pkg_str(pkg.cp + "-" + ver))
+ except InvalidData:
+ pass
+ for cpv in cp_list:
+ desc_cache[cpv] = desc
+ except IOError:
+ pass
+ else:
+ break
+ else:
+ # No descriptions index was found, so populate
+ # cp_map the slow way.
+ for cp in self._portdb.cp_all(trees=[repo_path]):
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ for cpv in self._portdb.cp_list(cp, mytree=repo_path):
+ if cpv not in cp_list:
+ cp_list.append(_pkg_str(cpv))
+
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+
+ def cp_all(self):
+ if self._cp_map is None:
+ self._init_index()
+ return list(self._cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
+
+
+class IndexedVardb(object):
+ """
+ A vardbapi interface that sacrifices validation in order to
+ improve performance. It takes advantage of vardbapi._aux_cache,
+ which is backed by vdb_metadata.pickle. Since _aux_cache is
+ not updated for every single merge/unmerge (see
+ _aux_cache_threshold), the list of packages is obtained directly
+ from the real vardbapi instance. If a package is missing from
+ _aux_cache, then its metadata is obtained using the normal
+ (validated) vardbapi.aux_get method.
+ """
+ def __init__(self, vardb):
+ self._vardb = vardb
+ self._aux_cache_keys = vardb._aux_cache_keys
+ self._cpv_sort_ascending = vardb._cpv_sort_ascending
+ self._cp_map = {}
+ self.cpv_exists = vardb.cpv_exists
+
+ def cp_all(self):
+ if self._cp_map:
+ return list(self._cp_map)
+ cp_map = self._cp_map
+ for cpv in self._vardb.cpv_all():
+ cp = portage.cpv_getkey(cpv)
+ if cp is not None:
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ cp_list.append(_pkg_str(cpv))
+ return list(cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._vardb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ pkg_data = self._vardb._aux_cache["packages"].get(cpv)
+ if not isinstance(pkg_data, tuple) or \
+ len(pkg_data) != 2 or \
+ not isinstance(pkg_data[1], dict):
+ pkg_data = None
+ if pkg_data is None:
+ # It may be missing from _aux_cache due to
+ # _aux_cache_threshold.
+ return self._vardb.aux_get(cpv, attrs)
+ metadata = pkg_data[1]
+ return [metadata.get(k, "") for k in attrs]
--
2.0.4
* [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
2014-10-18 3:28 [gentoo-portage-dev] [PATCH] emerge --search: use description index Zac Medico
@ 2014-10-18 5:59 ` Zac Medico
2014-10-19 21:51 ` Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
1 sibling, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-10-18 5:59 UTC
To: gentoo-portage-dev
This updated patch adds --search-index < y | n >. I'll be maintaining
this patch in the following branch:
https://github.com/zmedico/portage/tree/bug_525718
From 2aca92f664fd2ff669b77b38a49b06fafbc66b8d Mon Sep 17 00:00:00 2001
From: Zac Medico <zmedico@gentoo.org>
Date: Fri, 17 Oct 2014 17:38:59 -0700
Subject: [PATCH] emerge --search: use description index
This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions. If the package description index is missing from a particular
repository, then all metadata for that repository is obtained using the
normal portdbapi.aux_get method.
Searching of installed packages is optimized to take advantage of
vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
See the IndexedVardb docstring for some more details.
For users that would like to modify ebuilds in a repository without
running egencache afterwards, the new emerge --search-index < y | n >
option can be used to get non-indexed search. Alternatively, the user
could simply remove the stale index file, in order to disable the
search index for a particular repository.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
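As a usage sketch (the exact spelling follows the usual emerge syntax
for y/n options), a one-off non-indexed search would look something
like:

  emerge --search-index n --searchdesc sed

and the indexed default can be set persistently via
EMERGE_DEFAULT_OPTS in make.conf, as described in the man page hunk
below, while still being overridable on the command line.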
---
bin/egencache | 43 ++++++++++-
man/egencache.1 | 4 +
man/emerge.1 | 8 ++
man/portage.5 | 6 ++
pym/_emerge/actions.py | 3 +-
pym/_emerge/main.py | 5 ++
pym/_emerge/search.py | 198 +++++++++++++++++++++++++++++++++++++++++++++----
7 files changed, 250 insertions(+), 17 deletions(-)
diff --git a/bin/egencache b/bin/egencache
index e366058..90d5e68 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._eventloop.global_event_loop import global_event_loop
from portage import cpv_getkey
from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
try:
from xml.etree import ElementTree
@@ -91,6 +91,9 @@ def parse_args(args):
actions.add_argument("--update-changelogs",
action="store_true",
help="update the ChangeLog files from SCM logs")
+ actions.add_argument("--update-pkg-desc-index",
+ action="store_true",
+ help="update package description index")
actions.add_argument("--update-manifests",
action="store_true",
help="update manifests")
@@ -451,6 +454,35 @@ class GenCache(object):
if hasattr(trg_cache, '_prune_empty_dirs'):
trg_cache._prune_empty_dirs()
+class GenPkgDescIndex(object):
+ def __init__(self, portdb, output_file):
+ self.returncode = os.EX_OK
+ self._portdb = portdb
+ self._output_file = output_file
+
+ def run(self):
+
+ portage.util.ensure_dirs(os.path.dirname(self._output_file))
+ f = portage.util.atomic_ofstream(self._output_file,
+ encoding=_encodings["repo.content"])
+
+ portdb = self._portdb
+ for cp in portdb.cp_all():
+ pkgs = portdb.cp_list(cp)
+ if not pkgs:
+ continue
+ desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+ if len(pkgs) == 1:
+ output = "%s: %s\n" % (pkgs[0], desc)
+ else:
+ output = "%s,%s: %s\n" % (pkgs[0],
+ ",".join(_pkg_str(cpv).version
+ for cpv in pkgs[1:]), desc)
+ f.write(output)
+
+ f.close()
+
class GenUseLocalDesc(object):
def __init__(self, portdb, output=None,
preserve_comments=False):
@@ -893,7 +925,8 @@ def egencache_main(args):
local_config=False, env=env)
if not (options.update or options.update_use_local_desc or
- options.update_changelogs or options.update_manifests):
+ options.update_changelogs or options.update_manifests or
+ options.update_pkg_desc_index):
parser.error('No action specified')
return 1
@@ -1057,6 +1090,12 @@ def egencache_main(args):
else:
ret.append(scheduler.returncode)
+ if options.update_pkg_desc_index:
+ gen_index = GenPkgDescIndex(portdb, os.path.join(
+ repo_config.location, "metadata", "pkg_desc_index"))
+ gen_index.run()
+ ret.append(gen_index.returncode)
+
if options.update_use_local_desc:
gen_desc = GenUseLocalDesc(portdb,
output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
.BR "\-\-update\-changelogs"
Update the ChangeLog files from SCM logs (supported only in git repos).
.TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
.BR "\-\-update\-use\-local\-desc"
Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
.TP
diff --git a/man/emerge.1 b/man/emerge.1
index 2264b58..efd5d41 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -790,6 +790,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/man/portage.5 b/man/portage.5
index e399f0f..26856d1 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -75,6 +75,7 @@ user\-defined package sets
.BR /usr/portage/metadata/
.nf
layout.conf
+pkg_desc_index
.fi
.TP
.BR /usr/portage/profiles/
@@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms
profile\-formats = portage-2
.fi
.RE
+.TP
+.BR pkg_desc_index
+This is an index of packages and descriptions which may be generated
+by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search
+actions.
.RE
.TP
.BR /usr/portage/profiles/
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 4e8b83b..a81212c 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index 3883f72..d403b36 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..007abf2 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -3,13 +3,17 @@
from __future__ import print_function
+import io
import re
import portage
-from portage import os
+from portage import os, _encodings
from portage.dbapi.porttree import _parse_uri_map
+from portage.dep import Atom
+from portage.exception import InvalidData
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.versions import _pkg_str
from _emerge.Package import Package
@@ -25,12 +29,11 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index=True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
@@ -45,6 +48,10 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ if search_index:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,6 +60,7 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
@@ -97,7 +105,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -208,6 +216,20 @@ class search(object):
masked=1
self.matches["pkg"].append([package,masked])
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = portage.best(
+ self._xmatch("match-all", package))
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +239,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ self.matches["desc"].append((full_package, masked))
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -262,7 +278,7 @@ class search(object):
bold(self.searchkey) + " ]\n")
msg.append("[ Applications found : " + \
bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
@@ -372,7 +388,11 @@ class search(object):
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -392,3 +412,153 @@ class search(object):
result = ""
return result
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+ """
+ def __init__(self, portdb):
+ self._portdb = portdb
+ self.cpv_exists = portdb.cpv_exists
+ self.getFetchMap = portdb.getFetchMap
+ self.findname = portdb.findname
+ self._aux_cache_keys = portdb._aux_cache_keys
+ self._have_root_eclass_dir = portdb._have_root_eclass_dir
+ self._cpv_sort_ascending = portdb._cpv_sort_ascending
+ self._desc_cache = None
+ self._cp_map = None
+
+ def _init_index(self):
+ cp_map = {}
+ desc_cache = {}
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ for parent_dir in (repo_path, outside_repo):
+ file_path = os.path.join(parent_dir,
+ "metadata", "pkg_desc_index")
+
+ try:
+ with io.open(file_path,
+ encoding=_encodings["repo.content"]) as f:
+ for line in f:
+ pkgs, desc = line.split(":", 1)
+ desc = desc.strip()
+ pkgs = pkgs.split(",")
+ if not pkgs[0]:
+ continue
+ try:
+ pkg = _pkg_str(pkgs[0])
+ except InvalidData:
+ continue
+ cp_list = cp_map.get(pkg.cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[pkg.cp] = cp_list
+ cp_list.append(pkg)
+ for ver in pkgs[1:]:
+ try:
+ cp_list.append(
+ _pkg_str(pkg.cp + "-" + ver))
+ except InvalidData:
+ pass
+ for cpv in cp_list:
+ desc_cache[cpv] = desc
+ except IOError:
+ pass
+ else:
+ break
+ else:
+ # No descriptions index was found, so populate
+ # cp_map the slow way.
+ for cp in self._portdb.cp_all(trees=[repo_path]):
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ for cpv in self._portdb.cp_list(cp, mytree=repo_path):
+ if cpv not in cp_list:
+ cp_list.append(_pkg_str(cpv))
+
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+
+ def cp_all(self):
+ if self._cp_map is None:
+ self._init_index()
+ return list(self._cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
+
+
+class IndexedVardb(object):
+ """
+ A vardbapi interface that sacrifices validation in order to
+ improve performance. It takes advantage of vardbapi._aux_cache,
+ which is backed by vdb_metadata.pickle. Since _aux_cache is
+ not updated for every single merge/unmerge (see
+ _aux_cache_threshold), the list of packages is obtained directly
+ from the real vardbapi instance. If a package is missing from
+ _aux_cache, then its metadata is obtained using the normal
+ (validated) vardbapi.aux_get method.
+ """
+ def __init__(self, vardb):
+ self._vardb = vardb
+ self._aux_cache_keys = vardb._aux_cache_keys
+ self._cpv_sort_ascending = vardb._cpv_sort_ascending
+ self._cp_map = {}
+ self.cpv_exists = vardb.cpv_exists
+
+ def cp_all(self):
+ if self._cp_map:
+ return list(self._cp_map)
+ cp_map = self._cp_map
+ for cpv in self._vardb.cpv_all():
+ cp = portage.cpv_getkey(cpv)
+ if cp is not None:
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ cp_list.append(_pkg_str(cpv))
+ return list(cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._vardb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ pkg_data = self._vardb._aux_cache["packages"].get(cpv)
+ if not isinstance(pkg_data, tuple) or \
+ len(pkg_data) != 2 or \
+ not isinstance(pkg_data[1], dict):
+ pkg_data = None
+ if pkg_data is None:
+ # It may be missing from _aux_cache due to
+ # _aux_cache_threshold.
+ return self._vardb.aux_get(cpv, attrs)
+ metadata = pkg_data[1]
+ return [metadata.get(k, "") for k in attrs]
--
2.0.4
* [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
2014-10-18 5:59 ` [gentoo-portage-dev] " Zac Medico
@ 2014-10-19 21:51 ` Zac Medico
2014-10-23 8:55 ` Brian Dolbec
0 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-10-19 21:51 UTC
To: gentoo-portage-dev
This updated patch changes the index format to use spaces instead of
commas, for readability. Here is the example given in man/portage.5:
sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
sys-apps/usleep 0.1: A wrapper for usleep
Hopefully that's easier on the eyes (thanks to Michał Górny for the
suggestion).
Also, Michał has brought it to my attention that git will send the
whole file instead of the delta, unless an expensive `git repack`
operation is performed. Maybe it's possible to repack the user.git
each time the index is generated? Currently, the master rsync mirror
runs egencache every 30 minutes. If user.git syncs at the same
interval, it would need to be repacked at the same interval.
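(For illustration only: the repack in question could be as simple as a
`git repack -a -d` run on the server after each egencache pass;
whether that is affordable every 30 minutes would need to be measured.)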
Anyway, it would be nice to merge this patch, even if we don't have
the resources now to generate the index for gentoo on the server side.
We could follow up this patch later with a post emerge --sync hook
for client-side index generation.
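As a sketch of that idea (the hook mechanism and repository name here
are assumptions, not part of this patch), a client-side hook would
essentially just re-run the new action after each sync:

  egencache --repo gentoo --update-pkg-desc-index

which would keep the local index fresh without requiring any
server-side support.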
From 5192579f79da36f5a1ce5f3651c26ccb235cca28 Mon Sep 17 00:00:00 2001
From: Zac Medico <zmedico@gentoo.org>
Date: Fri, 17 Oct 2014 17:38:59 -0700
Subject: [PATCH] emerge --search: use description index
This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions. If the package description index is missing from a particular
repository, then all metadata for that repository is obtained using the
normal portdbapi.aux_get method.
Searching of installed packages is optimized to take advantage of
vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
See the IndexedVardb docstring for some more details.
For users that would like to modify ebuilds in a repository without
running egencache afterwards, the new emerge --search-index < y | n >
option can be used to get non-indexed search. Alternatively, the user
could simply remove the stale index file, in order to disable the
search index for a particular repository.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
bin/egencache | 39 +++++++++-
man/egencache.1 | 4 +
man/emerge.1 | 8 ++
man/portage.5 | 12 +++
pym/_emerge/actions.py | 3 +-
pym/_emerge/main.py | 5 ++
pym/_emerge/search.py | 207 +++++++++++++++++++++++++++++++++++++++++++++----
7 files changed, 261 insertions(+), 17 deletions(-)
diff --git a/bin/egencache b/bin/egencache
index e366058..95cb1ad 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._eventloop.global_event_loop import global_event_loop
from portage import cpv_getkey
from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
try:
from xml.etree import ElementTree
@@ -91,6 +91,9 @@ def parse_args(args):
actions.add_argument("--update-changelogs",
action="store_true",
help="update the ChangeLog files from SCM logs")
+ actions.add_argument("--update-pkg-desc-index",
+ action="store_true",
+ help="update package description index")
actions.add_argument("--update-manifests",
action="store_true",
help="update manifests")
@@ -451,6 +454,31 @@ class GenCache(object):
if hasattr(trg_cache, '_prune_empty_dirs'):
trg_cache._prune_empty_dirs()
+class GenPkgDescIndex(object):
+ def __init__(self, portdb, output_file):
+ self.returncode = os.EX_OK
+ self._portdb = portdb
+ self._output_file = output_file
+
+ def run(self):
+
+ portage.util.ensure_dirs(os.path.dirname(self._output_file))
+ f = portage.util.atomic_ofstream(self._output_file,
+ encoding=_encodings["repo.content"])
+
+ portdb = self._portdb
+ for cp in portdb.cp_all():
+ pkgs = portdb.cp_list(cp)
+ if not pkgs:
+ continue
+ desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+ f.write("%s %s: %s\n" % (cp,
+ " ".join(_pkg_str(cpv).version
+ for cpv in pkgs), desc))
+
+ f.close()
+
class GenUseLocalDesc(object):
def __init__(self, portdb, output=None,
preserve_comments=False):
@@ -893,7 +921,8 @@ def egencache_main(args):
local_config=False, env=env)
if not (options.update or options.update_use_local_desc or
- options.update_changelogs or options.update_manifests):
+ options.update_changelogs or options.update_manifests or
+ options.update_pkg_desc_index):
parser.error('No action specified')
return 1
@@ -1057,6 +1086,12 @@ def egencache_main(args):
else:
ret.append(scheduler.returncode)
+ if options.update_pkg_desc_index:
+ gen_index = GenPkgDescIndex(portdb, os.path.join(
+ repo_config.location, "metadata", "pkg_desc_index"))
+ gen_index.run()
+ ret.append(gen_index.returncode)
+
if options.update_use_local_desc:
gen_desc = GenUseLocalDesc(portdb,
output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
.BR "\-\-update\-changelogs"
Update the ChangeLog files from SCM logs (supported only in git repos).
.TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
.BR "\-\-update\-use\-local\-desc"
Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
.TP
diff --git a/man/emerge.1 b/man/emerge.1
index 2264b58..efd5d41 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -790,6 +790,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/man/portage.5 b/man/portage.5
index e399f0f..bf9457c 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -75,6 +75,7 @@ user\-defined package sets
.BR /usr/portage/metadata/
.nf
layout.conf
+pkg_desc_index
.fi
.TP
.BR /usr/portage/profiles/
@@ -1110,6 +1111,17 @@ cache\-formats = md5-dict pms
profile\-formats = portage-2
.fi
.RE
+.TP
+.BR pkg_desc_index
+This is an index of package names, versions, and descriptions which
+may be generated by \fBegencache\fR(1) in order to optimize
+\fBemerge\fR(1) search actions.
+
+.I Example:
+.nf
+sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
+sys-apps/usleep 0.1: A wrapper for usleep
+.fi
.RE
.TP
.BR /usr/portage/profiles/
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 4e8b83b..a81212c 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index 3883f72..d403b36 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..37fee20 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -3,13 +3,17 @@
from __future__ import print_function
+import io
import re
import portage
-from portage import os
+from portage import os, _encodings
from portage.dbapi.porttree import _parse_uri_map
+from portage.dep import Atom
+from portage.exception import InvalidAtom, InvalidData
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.versions import _pkg_str
from _emerge.Package import Package
@@ -25,12 +29,11 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index = True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
@@ -45,6 +48,10 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ if search_index:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,6 +60,7 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
@@ -97,7 +105,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -208,6 +216,22 @@ class search(object):
masked=1
self.matches["pkg"].append([package,masked])
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = self._xmatch("match-all", package)
+ if not full_package:
+ continue
+ full_package = full_package[-1]
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +241,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ self.matches["desc"].append((full_package, masked))
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -262,7 +280,7 @@ class search(object):
bold(self.searchkey) + " ]\n")
msg.append("[ Applications found : " + \
bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
@@ -372,7 +390,11 @@ class search(object):
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -392,3 +414,160 @@ class search(object):
result = ""
return result
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+ """
+ def __init__(self, portdb):
+ self._portdb = portdb
+ self.cpv_exists = portdb.cpv_exists
+ self.getFetchMap = portdb.getFetchMap
+ self.findname = portdb.findname
+ self._aux_cache_keys = portdb._aux_cache_keys
+ self._have_root_eclass_dir = portdb._have_root_eclass_dir
+ self._cpv_sort_ascending = portdb._cpv_sort_ascending
+ self._desc_cache = None
+ self._cp_map = None
+
+ def _init_index(self):
+ cp_map = {}
+ desc_cache = {}
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ for parent_dir in (repo_path, outside_repo):
+ file_path = os.path.join(parent_dir,
+ "metadata", "pkg_desc_index")
+
+ try:
+ with io.open(file_path,
+ encoding=_encodings["repo.content"]) as f:
+ for line in f:
+ try:
+ pkgs, desc = line.split(":", 1)
+ except ValueError:
+ continue
+ desc = desc.strip()
+ try:
+ cp, pkgs = pkgs.split(" ", 1)
+ except ValueError:
+ continue
+ if not cp:
+ continue
+ try:
+ atom = Atom(cp)
+ except InvalidAtom:
+ continue
+ if cp != atom.cp:
+ continue
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ for ver in pkgs.split():
+ try:
+ cpv = _pkg_str(cp + "-" + ver)
+ except InvalidData:
+ pass
+ else:
+ cp_list.append(cpv)
+ desc_cache[cpv] = desc
+ except IOError:
+ pass
+ else:
+ break
+ else:
+ # No descriptions index was found, so populate
+ # cp_map the slow way.
+ for cp in self._portdb.cp_all(trees=[repo_path]):
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ for cpv in self._portdb.cp_list(cp, mytree=repo_path):
+ if cpv not in cp_list:
+ cp_list.append(_pkg_str(cpv))
+
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+
+ def cp_all(self):
+ if self._cp_map is None:
+ self._init_index()
+ return list(self._cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
+
+
+class IndexedVardb(object):
+ """
+ A vardbapi interface that sacrifices validation in order to
+ improve performance. It takes advantage of vardbapi._aux_cache,
+ which is backed by vdb_metadata.pickle. Since _aux_cache is
+ not updated for every single merge/unmerge (see
+ _aux_cache_threshold), the list of packages is obtained directly
+ from the real vardbapi instance. If a package is missing from
+ _aux_cache, then its metadata is obtained using the normal
+ (validated) vardbapi.aux_get method.
+ """
+ def __init__(self, vardb):
+ self._vardb = vardb
+ self._aux_cache_keys = vardb._aux_cache_keys
+ self._cpv_sort_ascending = vardb._cpv_sort_ascending
+ self._cp_map = {}
+ self.cpv_exists = vardb.cpv_exists
+
+ def cp_all(self):
+ if self._cp_map:
+ return list(self._cp_map)
+ cp_map = self._cp_map
+ for cpv in self._vardb.cpv_all():
+ cp = portage.cpv_getkey(cpv)
+ if cp is not None:
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ cp_list.append(_pkg_str(cpv))
+ return list(cp_map)
+
+ def match(self, atom):
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._vardb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ pkg_data = self._vardb._aux_cache["packages"].get(cpv)
+ if not isinstance(pkg_data, tuple) or \
+ len(pkg_data) != 2 or \
+ not isinstance(pkg_data[1], dict):
+ pkg_data = None
+ if pkg_data is None:
+ # It may be missing from _aux_cache due to
+ # _aux_cache_threshold.
+ return self._vardb.aux_get(cpv, attrs)
+ metadata = pkg_data[1]
+ return [metadata.get(k, "") for k in attrs]
--
2.0.4
* Re: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
2014-10-19 21:51 ` Zac Medico
@ 2014-10-23 8:55 ` Brian Dolbec
2014-10-23 9:22 ` Zac Medico
0 siblings, 1 reply; 29+ messages in thread
From: Brian Dolbec @ 2014-10-23 8:55 UTC
To: gentoo-portage-dev
On Sun, 19 Oct 2014 14:51:05 -0700
Zac Medico <zmedico@gentoo.org> wrote:
> This updated patch changes the index format to use spaces instead of
> commas, for readability. This example given in man/portage.5:
>
> sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
> sys-apps/usleep 0.1: A wrapper for usleep
>
> Hopefully that's easier on the eyes [...]
> @@ -1057,6 +1086,12 @@ def egencache_main(args):
>
...
/snip
All the above looks good to me.
> diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
> index 4b0fd9f..37fee20 100644
> --- a/pym/_emerge/search.py
> +++ b/pym/_emerge/search.py
> @@ -3,13 +3,17 @@
>
> from __future__ import print_function
>
> +import io
> import re
> import portage
> -from portage import os
> +from portage import os, _encodings
> from portage.dbapi.porttree import _parse_uri_map
> +from portage.dep import Atom
> +from portage.exception import InvalidAtom, InvalidData
> from portage.localization import localized_size
> from portage.output import bold, bold as white, darkgreen, green, red
> from portage.util import writemsg_stdout
> +from portage.versions import _pkg_str
>
> from _emerge.Package import Package
>
> @@ -25,12 +29,11 @@ class search(object):
> # public interface
> #
> def __init__(self, root_config, spinner, searchdesc,
> - verbose, usepkg, usepkgonly):
> + verbose, usepkg, usepkgonly, search_index = True):
> """Searches the available and installed packages for
> the supplied search key. The list of available and installed packages
> is created at object instantiation. This makes successive searches
> faster.""" self.settings = root_config.settings
> - self.vartree = root_config.trees["vartree"]
> self.spinner = spinner
> self.verbose = verbose
> self.searchdesc = searchdesc
> @@ -45,6 +48,10 @@ class search(object):
> bindb = root_config.trees["bintree"].dbapi
> vardb = root_config.trees["vartree"].dbapi
>
> + if search_index:
> + portdb = IndexedPortdb(portdb)
> + vardb = IndexedVardb(vardb)
> +
> if not usepkgonly and portdb._have_root_eclass_dir:
> self._dbs.append(portdb)
>
> @@ -53,6 +60,7 @@ class search(object):
>
> self._dbs.append(vardb)
> self._portdb = portdb
> + self._vardb = vardb
>
> def _spinner_update(self):
> if self.spinner:
> @@ -97,7 +105,7 @@ class search(object):
> return {}
>
> def _visible(self, db, cpv, metadata):
> - installed = db is self.vartree.dbapi
> + installed = db is self._vardb
> built = installed or db is not self._portdb
> pkg_type = "ebuild"
> if installed:
> @@ -208,6 +216,22 @@ class search(object):
> masked=1
> self.matches["pkg"].append([package,masked])
> elif self.searchdesc: # DESCRIPTION searching
> + # Check for DESCRIPTION match first, so that we can skip
> + # the expensive visibility check if it doesn't match.
> + full_package = self._xmatch("match-all", package)
> + if not full_package:
> + continue
> + full_package = full_package[-1]
> + try:
> + full_desc = self._aux_get(
> + full_package, ["DESCRIPTION"])[0]
> + except KeyError:
> + portage.writemsg(
> + "emerge: search: aux_get() failed, skipping\n",
> + noiselevel=-1)
> + continue
> + if not self.searchre.search(full_desc):
> + continue
> full_package = self._xmatch("bestmatch-visible", package)
> if not full_package:
> #no match found; we don't want to query description
> @@ -217,14 +241,8 @@ class search(object):
> continue
> else:
> masked=1
> - try:
> - full_desc = self._aux_get(
> - full_package, ["DESCRIPTION"])[0]
> - except KeyError:
> - print("emerge: search: aux_get() failed, skipping")
> - continue
> - if self.searchre.search(full_desc):
> - self.matches["desc"].append([full_package,masked])
> +
> + self.matches["desc"].append((full_package, masked))
> self.sdict = self.setconfig.getSets()
> for setname in self.sdict:
> @@ -262,7 +280,7 @@ class search(object):
> bold(self.searchkey) + " ]\n")
> msg.append("[ Applications found : " + \
> bold(str(self.mlen)) + " ]\n\n")
> - vardb = self.vartree.dbapi
> + vardb = self._vardb
> metadata_keys = set(Package.metadata_keys)
> metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
> metadata_keys = tuple(metadata_keys)
> @@ -372,7 +390,11 @@ class search(object):
> # private interface
> #
> def getInstallationStatus(self,package):
> - installed_package = self.vartree.dep_bestmatch(package)
> + installed_package = self._vardb.match(package)
> + if installed_package:
> + installed_package = installed_package[-1]
> + else:
> + installed_package = ""
> result = ""
> version = self.getVersion(installed_package,search.VERSION_RELEASE)
> if len(version) > 0:
> @@ -392,3 +414,160 @@ class search(object):
> result = ""
> return result
What I wonder is why the following two classes aren't in the portage
namespace. There is far too much logic embedded in the _emerge
namespace. They would most probably belong under the portage/dbapi
subpackage. Looking at them, they do look very similar to the portdbapi
and vardbapi classes; they are just stripped down and optimised for
this data. They also don't seem to use any _emerge-specific modules
that I saw.
Perhaps with a file name of index.py or indexers.py.
> +
> +class IndexedPortdb(object):
> + """
> + A portdbapi interface that uses a package description index to
*** See ^^^ even the second word of the class description seems
to agree with me :)
> + improve performance. If the description index is missing for a
> + particular repository, then all metadata for that repository is
> + obtained using the normal portdbapi.aux_get method.
> + """
> + def __init__(self, portdb):
> + self._portdb = portdb
> + self.cpv_exists = portdb.cpv_exists
> + self.getFetchMap = portdb.getFetchMap
> + self.findname = portdb.findname
> + self._aux_cache_keys = portdb._aux_cache_keys
> + self._have_root_eclass_dir = portdb._have_root_eclass_dir
> + self._cpv_sort_ascending = portdb._cpv_sort_ascending
> + self._desc_cache = None
> + self._cp_map = None
> +
> + def _init_index(self):
> + cp_map = {}
> + desc_cache = {}
> + for repo_path in self._portdb.porttrees:
> + outside_repo = os.path.join(self._portdb.depcachedir,
> + repo_path.lstrip(os.sep))
> + for parent_dir in (repo_path, outside_repo):
> + file_path = os.path.join(parent_dir,
> + "metadata", "pkg_desc_index")
> +
> + try:
> + with io.open(file_path,
> + encoding=_encodings["repo.content"]) as f:
> + for line in f:
> + try:
> + pkgs, desc = line.split(":", 1)
> + except ValueError:
> + continue
> + desc = desc.strip()
> + try:
> + cp, pkgs = pkgs.split(" ", 1)
> + except ValueError:
> + continue
> + if not cp:
> + continue
> + try:
> + atom = Atom(cp)
> + except InvalidAtom:
> + continue
> + if cp != atom.cp:
> + continue
> + cp_list = cp_map.get(cp)
> + if cp_list is None:
> + cp_list = []
> + cp_map[cp] = cp_list
> + for ver in pkgs.split():
> + try:
> + cpv = _pkg_str(cp + "-" + ver)
> + except InvalidData:
> + pass
> + else:
> + cp_list.append(cpv)
> + desc_cache[cpv] = desc
> + except IOError:
> + pass
> + else:
> + break
> + else:
> + # No descriptions index was found, so populate
> + # cp_map the slow way.
> + for cp in self._portdb.cp_all(trees=[repo_path]):
> + cp_list = cp_map.get(cp)
> + if cp_list is None:
> + cp_list = []
> + cp_map[cp] = cp_list
> + for cpv in self._portdb.cp_list(cp, mytree=repo_path):
> + if cpv not in cp_list:
> + cp_list.append(_pkg_str(cpv))
> +
> + self._desc_cache = desc_cache
> + self._cp_map = cp_map
> +
> + def cp_all(self):
> + if self._cp_map is None:
> + self._init_index()
> + return list(self._cp_map)
> +
> + def match(self, atom):
> + if not isinstance(atom, Atom):
> + atom = Atom(atom)
> + cp_list = self._cp_map.get(atom.cp)
> + if cp_list is None:
> + return []
> + self._portdb._cpv_sort_ascending(cp_list)
> + return portage.match_from_list(atom, cp_list)
> +
> + def aux_get(self, cpv, attrs, myrepo = None):
> + if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
> + try:
> + return [self._desc_cache[cpv]]
> + except KeyError:
> + pass
> + return self._portdb.aux_get(cpv, attrs)
> +
> +
> +class IndexedVardb(object):
> + """
> + A vardbapi interface that sacrifices validation in order to
> + improve performance. It takes advantage of vardbapi._aux_cache,
> + which is backed by vdb_metadata.pickle. Since _aux_cache is
> + not updated for every single merge/unmerge (see
> + _aux_cache_threshold), the list of packages is obtained directly
> + from the real vardbapi instance. If a package is missing from
> + _aux_cache, then its metadata is obtained using the normal
> + (validated) vardbapi.aux_get method.
> + """
> + def __init__(self, vardb):
> + self._vardb = vardb
> + self._aux_cache_keys = vardb._aux_cache_keys
> + self._cpv_sort_ascending = vardb._cpv_sort_ascending
> + self._cp_map = {}
> + self.cpv_exists = vardb.cpv_exists
> +
> + def cp_all(self):
> + if self._cp_map:
> + return list(self._cp_map)
> + cp_map = self._cp_map
> + for cpv in self._vardb.cpv_all():
> + cp = portage.cpv_getkey(cpv)
> + if cp is not None:
> + cp_list = cp_map.get(cp)
> + if cp_list is None:
> + cp_list = []
> + cp_map[cp] = cp_list
> + cp_list.append(_pkg_str(cpv))
> + return list(cp_map)
> +
> + def match(self, atom):
> + if not isinstance(atom, Atom):
> + atom = Atom(atom)
> + cp_list = self._cp_map.get(atom.cp)
> + if cp_list is None:
> + return []
> + self._vardb._cpv_sort_ascending(cp_list)
> + return portage.match_from_list(atom, cp_list)
> +
> + def aux_get(self, cpv, attrs, myrepo = None):
> + pkg_data = self._vardb._aux_cache["packages"].get(cpv)
> + if not isinstance(pkg_data, tuple) or \
> + len(pkg_data) != 2 or \
> + not isinstance(pkg_data[1], dict):
> + pkg_data = None
> + if pkg_data is None:
> + # It may be missing from _aux_cache due to
> + # _aux_cache_threshold.
> + return self._vardb.aux_get(cpv, attrs)
> + metadata = pkg_data[1]
> + return [metadata.get(k, "") for k in attrs]
Otherwise it looks good.
--
Brian Dolbec <dolsen>
* Re: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
2014-10-23 8:55 ` Brian Dolbec
@ 2014-10-23 9:22 ` Zac Medico
2014-11-01 6:15 ` Zac Medico
0 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-10-23 9:22 UTC
To: gentoo-portage-dev
On 10/23/2014 01:55 AM, Brian Dolbec wrote:
> What I wonder, is why the following two classes aren't in the portage
> namespace. There is far too much logic embedded in the _emerge
> namespace. Most probably under the portage/dpapi subpkg. Looking at
> them, they do look very similar to the portdbapi and vardbapi classes.
> They are just stripped down and optimised for this data. They also
> don't seem to use any _emerge specific namespace modules that I saw.
>
> Perhaps with a file name of index.py or indexers.py
The classes aren't really designed for general use, so that's why I left
them bundled in _emerge.search. However, the classes could certainly be
fleshed out for general-purpose use.
In their current state, the indexer classes implement only the minimal
functionality required by the search class. A drawback of the current
IndexedPortdb design is that it holds all of the index data in memory at
once. I took this approach because that allowed it to easily fit the
dbapi interface used by the search class.
In the future, we might decide to rewrite the search class so that it
processes the index as a stream, which will allow individual search
results to be displayed as soon as they are located [1]. This rewrite
will require a new index API.
So, considering that we probably want a new index API in the future, we
might save some unnecessary effort and stick with the
special-purpose/minimalistic IndexedPortdb/Vardb adapters for now.
[1] https://bugs.gentoo.org/show_bug.cgi?id=412471
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
2014-10-23 9:22 ` Zac Medico
@ 2014-11-01 6:15 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-01 6:15 UTC (permalink / raw
To: gentoo-portage-dev
On 10/23/2014 02:22 AM, Zac Medico wrote:
> In the future, we might decide to rewrite the search class so that it
> processes the index as a stream, which will allow individual search
> results to be displayed as soon as they are located [1]. This rewrite
> will require a new index API.
>
> So, considering that we probably want a new index API in the future, we
> might save some unnecessary effort and stick with the
> special-purpose/minimalistic IndexedPortdb/Vardb adapters for now.
>
> [1] https://bugs.gentoo.org/show_bug.cgi?id=412471
I have a stream-based back end working in this branch:
https://github.com/zmedico/portage/commits/bug_525718_stream
Now all that's left to do is to convert IndexedPortdb to expose an
iter_cp_all method, and to fix the search class to iterate over
iter_cp_all and display results incrementally. Once that's done, I'll
think about exposing useful pieces as public APIs.
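Roughly, the display loop would end up shaped like this (iter_cp_all and
the stub class are placeholders for illustration, not settled API):

import re

# A trivial stand-in for the eventual IndexedPortdb, just to show the
# shape of the incremental loop.
class _StubIndexedPortdb(object):
	def iter_cp_all(self):
		for cp in ("app-editors/nano", "sys-apps/sed"):
			yield cp

searchre = re.compile("sed")
for cp in _StubIndexedPortdb().iter_cp_all():
	if searchre.search(cp):
		# display the match as soon as it is located, instead of
		# buffering the full result list first
		print(cp)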
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev]
2014-10-18 3:28 [gentoo-portage-dev] [PATCH] emerge --search: use description index Zac Medico
2014-10-18 5:59 ` [gentoo-portage-dev] " Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
` (5 more replies)
1 sibling, 6 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev
In addition to indexed search, this patch series implements
stream-based incremental display of search results (fixing bug
#412471).
I'll be maintaining this patch series in the following branch:
https://github.com/zmedico/portage/tree/bug_525718_stream
The old non-incremental version of this patch series is still
available here:
https://github.com/zmedico/portage/tree/bug_525718
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action.
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-04 9:03 ` [gentoo-portage-dev] [PATCH 1/5 v2] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
` (4 subsequent siblings)
5 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
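For reference, a quick sketch of how the new index format round-trips;
this is illustration only (the version numbers are made up), assuming the
helpers land in portage.cache.index.pkg_desc_index as in this patch:

from portage.cache.index.pkg_desc_index import (
	pkg_desc_index_line_format, pkg_desc_index_line_read)

# One index line per package: "<cp> <ver> <ver> ...: <DESCRIPTION>"
line = pkg_desc_index_line_format(
	"sys-apps/sed",
	["sys-apps/sed-4.2.1", "sys-apps/sed-4.2.2"],
	"Super-useful stream editor")
# line == "sys-apps/sed 4.2.1 4.2.2: Super-useful stream editor\n"

node = pkg_desc_index_line_read(line)
# node.cp == "sys-apps/sed"
# node.cpv_list == ("sys-apps/sed-4.2.1", "sys-apps/sed-4.2.2")
# node.desc == "Super-useful stream editor"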
bin/egencache | 38 ++++++++++++++++++++++--
man/egencache.1 | 4 +++
man/portage.5 | 12 ++++++++
pym/portage/cache/index/__init__.py | 2 ++
pym/portage/cache/index/pkg_desc_index.py | 48 +++++++++++++++++++++++++++++++
5 files changed, 102 insertions(+), 2 deletions(-)
create mode 100644 pym/portage/cache/index/__init__.py
create mode 100644 pym/portage/cache/index/pkg_desc_index.py
diff --git a/bin/egencache b/bin/egencache
index e366058..f97432f 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -48,6 +48,7 @@ portage._internal_caller = True
from portage import os, _encodings, _unicode_encode, _unicode_decode
from _emerge.MetadataRegen import MetadataRegen
from portage.cache.cache_errors import CacheError, StatCollision
+from portage.cache.index.pkg_desc_index import pkg_desc_index_line_format
from portage.const import TIMESTAMP_FORMAT
from portage.manifest import guessManifestFileType
from portage.package.ebuild._parallel_manifest.ManifestScheduler import ManifestScheduler
@@ -57,7 +58,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._eventloop.global_event_loop import global_event_loop
from portage import cpv_getkey
from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
try:
from xml.etree import ElementTree
@@ -91,6 +92,9 @@ def parse_args(args):
actions.add_argument("--update-changelogs",
action="store_true",
help="update the ChangeLog files from SCM logs")
+ actions.add_argument("--update-pkg-desc-index",
+ action="store_true",
+ help="update package description index")
actions.add_argument("--update-manifests",
action="store_true",
help="update manifests")
@@ -451,6 +455,29 @@ class GenCache(object):
if hasattr(trg_cache, '_prune_empty_dirs'):
trg_cache._prune_empty_dirs()
+class GenPkgDescIndex(object):
+ def __init__(self, portdb, output_file):
+ self.returncode = os.EX_OK
+ self._portdb = portdb
+ self._output_file = output_file
+
+ def run(self):
+
+ portage.util.ensure_dirs(os.path.dirname(self._output_file))
+ f = portage.util.atomic_ofstream(self._output_file,
+ encoding=_encodings["repo.content"])
+
+ portdb = self._portdb
+ for cp in portdb.cp_all():
+ pkgs = portdb.cp_list(cp)
+ if not pkgs:
+ continue
+ desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+ f.write(pkg_desc_index_line_format(cp, pkgs, desc))
+
+ f.close()
+
class GenUseLocalDesc(object):
def __init__(self, portdb, output=None,
preserve_comments=False):
@@ -893,7 +920,8 @@ def egencache_main(args):
local_config=False, env=env)
if not (options.update or options.update_use_local_desc or
- options.update_changelogs or options.update_manifests):
+ options.update_changelogs or options.update_manifests or
+ options.update_pkg_desc_index):
parser.error('No action specified')
return 1
@@ -1057,6 +1085,12 @@ def egencache_main(args):
else:
ret.append(scheduler.returncode)
+ if options.update_pkg_desc_index:
+ gen_index = GenPkgDescIndex(portdb, os.path.join(
+ repo_config.location, "metadata", "pkg_desc_index"))
+ gen_index.run()
+ ret.append(gen_index.returncode)
+
if options.update_use_local_desc:
gen_desc = GenUseLocalDesc(portdb,
output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
.BR "\-\-update\-changelogs"
Update the ChangeLog files from SCM logs (supported only in git repos).
.TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
.BR "\-\-update\-use\-local\-desc"
Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
.TP
diff --git a/man/portage.5 b/man/portage.5
index 309e259..f2f5243 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -76,6 +76,7 @@ user\-defined package sets
.BR /usr/portage/metadata/
.nf
layout.conf
+pkg_desc_index
.fi
.TP
.BR /usr/portage/profiles/
@@ -1138,6 +1139,17 @@ cache\-formats = md5-dict pms
profile\-formats = portage-2
.fi
.RE
+.TP
+.BR pkg_desc_index
+This is an index of package names, versions, and descriptions which
+may be generated by \fBegencache\fR(1) in order to optimize
+\fBemerge\fR(1) search actions.
+
+.I Example:
+.nf
+sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
+sys-apps/usleep 0.1: A wrapper for usleep
+.fi
.RE
.TP
.BR /usr/portage/profiles/
diff --git a/pym/portage/cache/index/__init__.py b/pym/portage/cache/index/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/cache/index/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
diff --git a/pym/portage/cache/index/pkg_desc_index.py b/pym/portage/cache/index/pkg_desc_index.py
new file mode 100644
index 0000000..7a05984
--- /dev/null
+++ b/pym/portage/cache/index/pkg_desc_index.py
@@ -0,0 +1,48 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import collections
+
+from portage.dep import Atom
+from portage.exception import InvalidAtom, InvalidData
+from portage.versions import _pkg_str
+
+pkg_desc_index_node = collections.namedtuple("pkg_desc_index_node",
+ ["cp", "cpv_list", "desc"])
+
+def pkg_desc_index_line_format(cp, pkgs, desc):
+ return "%s %s: %s\n" % (cp,
+ " ".join(_pkg_str(cpv).version
+ for cpv in pkgs), desc)
+
+def pkg_desc_index_line_read(line, repo = None):
+
+ try:
+ pkgs, desc = line.split(":", 1)
+ except ValueError:
+ return None
+ desc = desc.strip()
+
+ try:
+ cp, pkgs = pkgs.split(" ", 1)
+ except ValueError:
+ return None
+
+ try:
+ atom = Atom(cp)
+ except InvalidAtom:
+ return None
+ if cp != atom.cp:
+ return None
+
+ cp_list = []
+ for ver in pkgs.split():
+ try:
+ cpv = _pkg_str(
+ cp + "-" + ver, repo = repo)
+ except InvalidData:
+ pass
+ else:
+ cp_list.append(cpv)
+
+ return pkg_desc_index_node(cp, tuple(cp_list), desc)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy.
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-02 0:18 ` Zac Medico
` (2 more replies)
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class Zac Medico
` (3 subsequent siblings)
5 siblings, 3 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This IndexStreamIterator class can be used together with the
pkg_desc_index_line_read function to read an index file incrementally
as a stream.
The MultiIterGroupBy class can be used to iterate over multiple
IndexStreamIterator instances at once, incrementally grouping results
for a particular package from multiple indices, while limiting the
amount of any given index that must be in memory at once.
Both of these classes are used by the IndexedPortdb class in the next
patch of this series.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
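As a usage sketch (not part of the patch; the path and encoding key are
the ones used elsewhere in this series), streaming an index looks like
this, and MultiIterGroupBy then merges several such iterators keyed by cp:

import io

from portage import _encodings
from portage.cache.index.IndexStreamIterator import IndexStreamIterator
from portage.cache.index.pkg_desc_index import pkg_desc_index_line_read

f = io.open("/usr/portage/metadata/pkg_desc_index",
	encoding=_encodings["repo.content"])
# Each step parses exactly one line, so only the current entry has to
# be in memory; the iterator closes the file when it is exhausted.
for node in IndexStreamIterator(f, pkg_desc_index_line_read):
	print("%s: %s" % (node.cp, node.desc))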
pym/portage/cache/index/IndexStreamIterator.py | 27 +++++++++
pym/portage/util/iterators/MultiIterGroupBy.py | 82 ++++++++++++++++++++++++++
pym/portage/util/iterators/__init__.py | 2 +
3 files changed, 111 insertions(+)
create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
create mode 100644 pym/portage/util/iterators/__init__.py
diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
new file mode 100644
index 0000000..972aee1
--- /dev/null
+++ b/pym/portage/cache/index/IndexStreamIterator.py
@@ -0,0 +1,27 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class IndexStreamIterator(object):
+
+ def __init__(self, f, parser):
+
+ self.parser = parser
+ self._file = f
+
+ def close(self):
+
+ if self._file is not None:
+ self._file.close()
+ self._file = None
+
+ def __iter__(self):
+
+ try:
+
+ for line in self._file:
+ node = self.parser(line)
+ if node is not None:
+ yield node
+
+ finally:
+ self.close()
diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
new file mode 100644
index 0000000..d4e62ad
--- /dev/null
+++ b/pym/portage/util/iterators/MultiIterGroupBy.py
@@ -0,0 +1,82 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class MultiIterGroupBy(object):
+ """
+ This class functions similarly to the itertools.groupby function,
+ except that it takes multiple source iterators as input. The source
+ iterators must yield objects in sorted order. A group is yielded as
+ soon as the progress of all iterators reaches a state which
+ guarantees that there can not be any remaining (unseen) elements of
+ the group. This is useful for incremental display of grouped search
+ results.
+ """
+
+ def __init__(self, iterators, key = None):
+ self._iterators = iterators
+ self._key = key
+
+ def __iter__(self):
+
+ progress = []
+ iterators = self._iterators[:]
+ for index in iterators:
+ progress.append(None)
+
+ key_map = {}
+ eof = []
+ key_getter = self._key
+ if key_getter is None:
+ key_getter = lambda x: x
+ max_progress = None
+
+ while iterators:
+ min_progress = None
+ for i, index in enumerate(iterators):
+
+ if max_progress is not None and \
+ max_progress == progress[i] and \
+ min_progress is not None and \
+ max_progress != min_progress:
+ # This one has the most progress,
+ # so allow the others to catch up.
+ continue
+
+ for entry in index:
+ progress[i] = key_getter(entry)
+ key_group = key_map.get(key_getter(entry))
+ if key_group is None:
+ key_group = []
+ key_map[key_getter(entry)] = key_group
+
+ key_group.append(entry)
+
+ if min_progress is None or \
+ key_getter(entry) < min_progress:
+ min_progress = key_getter(entry)
+
+ if max_progress is None or \
+ key_getter(entry) >= max_progress:
+ max_progress = key_getter(entry)
+ # This one has the most progress,
+ # so allow the others to catch up.
+ break
+
+ else:
+ eof.append(i)
+
+ if eof:
+ for i in reversed(eof):
+ del iterators[i]
+ del progress[i]
+ del eof[:]
+
+ yield_these = []
+ for k in key_map:
+ if k <= min_progress:
+ yield_these.append(k)
+
+ if yield_these:
+ yield_these.sort()
+ for k in yield_these:
+ yield key_map.pop(k)
diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/util/iterators/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class.
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-04 5:07 ` [gentoo-portage-dev] [PATCH 3/5 v2] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class Zac Medico
` (2 subsequent siblings)
5 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The IndexedPortdb class uses pkg_desc_index to optimize searches for
package names and descriptions. If the package description index is
missing from a particular repository, then all metadata for that
repository is obtained using the normal portdbapi.aux_get method.
This class only implements a subset of portdbapi functionality that is
useful for searching pkg_desc_index incrementally. For this reason,
the cp_all method returns an ordered iterator instead of a list, so
that search results can be displayed incrementally.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
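A rough usage sketch (not part of the patch; portage.portdb is just the
legacy global standing in for whatever portdbapi instance the caller
already has):

import re

import portage
from portage.dbapi.IndexedPortdb import IndexedPortdb

indexed = IndexedPortdb(portage.portdb)
searchre = re.compile("sed")
# cp_all() is an ordered iterator, so matches can be shown as soon as
# the corresponding index entries have been read.
for cp in indexed.cp_all():
	if searchre.search(cp):
		cpv_list = indexed.match(cp)
		if cpv_list:
			desc, = indexed.aux_get(cpv_list[-1], ["DESCRIPTION"])
			print("%s: %s" % (cp, desc))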
pym/portage/dbapi/IndexedPortdb.py | 151 +++++++++++++++++++++++++++++++++++++
1 file changed, 151 insertions(+)
create mode 100644 pym/portage/dbapi/IndexedPortdb.py
diff --git a/pym/portage/dbapi/IndexedPortdb.py b/pym/portage/dbapi/IndexedPortdb.py
new file mode 100644
index 0000000..4fb2cf1
--- /dev/null
+++ b/pym/portage/dbapi/IndexedPortdb.py
@@ -0,0 +1,151 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import errno
+import io
+import functools
+import operator
+import os
+
+import portage
+from portage import _encodings
+from portage.dep import Atom
+from portage.exception import FileNotFound
+from portage.cache.index.IndexStreamIterator import IndexStreamIterator
+from portage.cache.index.pkg_desc_index import pkg_desc_index_line_read
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
+from portage.versions import _pkg_str
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+
+ For performance reasons, the match method only supports package
+ name and version constraints. For the same reason, the xmatch
+ method is not implemented.
+ """
+
+ _copy_attrs = ('cpv_exists', 'findname', 'getFetchMap',
+ '_aux_cache_keys', '_cpv_sort_ascending',
+ '_have_root_eclass_dir')
+
+ def __init__(self, portdb):
+
+ self._portdb = portdb
+
+ for k in self._copy_attrs:
+ setattr(self, k, getattr(portdb, k))
+
+ self._desc_cache = None
+ self._cp_map = None
+
+ def _init_index(self):
+
+ cp_map = {}
+ desc_cache = {}
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+
+ streams = []
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ filenames = []
+ for parent_dir in (repo_path, outside_repo):
+ filenames.append(os.path.join(parent_dir,
+ "metadata", "pkg_desc_index"))
+
+ repo_name = self._portdb.getRepositoryName(repo_path)
+
+ try:
+ f = None
+ for filename in filenames:
+ try:
+ f = io.open(filename,
+ encoding=_encodings["repo.content"])
+ except IOError as e:
+ if e.errno not in (errno.ENOENT, errno.ESTALE):
+ raise
+ else:
+ break
+
+ if f is None:
+ raise FileNotFound(filename)
+
+ streams.append(iter(IndexStreamIterator(f,
+ functools.partial(pkg_desc_index_line_read,
+ repo = repo_name))))
+ except FileNotFound:
+
+ # No descriptions index was found, so populate
+ # cp_map the slow way.
+ for cp in self._portdb.cp_all(trees=[repo_path]):
+
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ for cpv in self._portdb.cp_list(
+ cp, mytree = repo_path):
+ cp_list.append(_pkg_str(cpv, repo = repo_name))
+
+ # Create a sorted queue that will be merged with the
+ # sorted/grouped results from MultiIterGroupBy as they
+ # become available.
+ yield_queue = sorted(cp_map, reverse = True)
+
+ for cp_group in MultiIterGroupBy(streams,
+ key = operator.attrgetter("cp")):
+
+ new_cp = None
+ cp_list = cp_map.get(cp_group[0].cp)
+ if cp_list is None:
+ new_cp = cp_group[0].cp
+ cp_list = []
+ cp_map[cp_group[0].cp] = cp_list
+
+ for entry in cp_group:
+ cp_list.extend(entry.cpv_list)
+ for cpv in entry.cpv_list:
+ desc_cache[cpv] = entry.desc
+
+ if new_cp is not None:
+ while yield_queue and yield_queue[-1] < new_cp:
+ yield yield_queue.pop()
+ yield cp_group[0].cp
+
+ while yield_queue:
+ yield yield_queue.pop()
+
+ def cp_all(self):
+ """
+ Returns an ordered iterator instead of a list, so that search
+ results can be displayed incrementally.
+ """
+ if self._cp_map is None:
+ return self._init_index()
+ return iter(sorted(self._cp_map))
+
+ def match(self, atom):
+ """
+ For performance reasons, only package name and version
+ constraints are supported.
+ """
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class.
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
` (2 preceding siblings ...)
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-05 9:59 ` [gentoo-portage-dev] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
2014-11-03 21:42 ` [gentoo-portage-dev] Brian Dolbec
5 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
Searching of installed packages is optimized to take advantage of
vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
This class only implements a subset of vardbapi functionality that is
useful for searching incrementally. For this reason, the cp_all method
returns an ordered iterator instead of a list, so that search results
can be displayed incrementally.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
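A usage sketch, not part of the patch; the vardb here is assumed to come
from the usual trees dict (portage.db[portage.root]["vartree"].dbapi):

import portage
from portage.dbapi.IndexedVardb import IndexedVardb

vardb = portage.db[portage.root]["vartree"].dbapi
indexed = IndexedVardb(vardb)
# cp_all() yields installed package names in sorted order while the vdb
# is still being scanned; aux_get() serves metadata from _aux_cache and
# only falls back to the validated vardbapi.aux_get for packages that
# are missing from the cache.
for cp in indexed.cp_all():
	for cpv in indexed.match(cp):
		desc, = indexed.aux_get(cpv, ["DESCRIPTION"])
		print("%s: %s" % (cpv, desc))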
pym/portage/dbapi/IndexedVardb.py | 87 +++++++++++++++++++++++++++++++++++++++
pym/portage/dbapi/vartree.py | 19 +++++++--
2 files changed, 102 insertions(+), 4 deletions(-)
create mode 100644 pym/portage/dbapi/IndexedVardb.py
diff --git a/pym/portage/dbapi/IndexedVardb.py b/pym/portage/dbapi/IndexedVardb.py
new file mode 100644
index 0000000..b2d894b
--- /dev/null
+++ b/pym/portage/dbapi/IndexedVardb.py
@@ -0,0 +1,87 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import portage
+from portage.dep import Atom
+from portage.versions import _pkg_str
+
+class IndexedVardb(object):
+ """
+ A vardbapi interface that sacrifices validation in order to
+ improve performance. It takes advantage of vardbapi._aux_cache,
+ which is backed by vdb_metadata.pickle. Since _aux_cache is
+ not updated for every single merge/unmerge (see
+ _aux_cache_threshold), the list of packages is obtained directly
+ from the real vardbapi instance. If a package is missing from
+ _aux_cache, then its metadata is obtained using the normal
+ (validated) vardbapi.aux_get method.
+
+ For performance reasons, the match method only supports package
+ name and version constraints.
+ """
+
+ _copy_attrs = ('cpv_exists',
+ '_aux_cache_keys', '_cpv_sort_ascending')
+
+ def __init__(self, vardb):
+ self._vardb = vardb
+
+ for k in self._copy_attrs:
+ setattr(self, k, getattr(vardb, k))
+
+ self._cp_map = None
+
+ def cp_all(self):
+ """
+ Returns an ordered iterator instead of a list, so that search
+ results can be displayed incrementally.
+ """
+ if self._cp_map is not None:
+ return iter(sorted(self._cp_map))
+
+ return self._iter_cp_all()
+
+ def _iter_cp_all(self):
+ self._cp_map = cp_map = {}
+ previous_cp = None
+ for cpv in self._vardb._iter_cpv_all(sort = True):
+ cp = portage.cpv_getkey(cpv)
+ if cp is not None:
+ cp_list = cp_map.get(cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cp] = cp_list
+ cp_list.append(_pkg_str(cpv))
+ if previous_cp is not None and \
+ previous_cp != cp:
+ yield previous_cp
+ previous_cp = cp
+
+ if previous_cp is not None:
+ yield previous_cp
+
+ def match(self, atom):
+ """
+ For performance reasons, only package name and version
+ constraints are supported.
+ """
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._vardb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ pkg_data = self._vardb._aux_cache["packages"].get(cpv)
+ if not isinstance(pkg_data, tuple) or \
+ len(pkg_data) != 2 or \
+ not isinstance(pkg_data[1], dict):
+ pkg_data = None
+ if pkg_data is None:
+ # It may be missing from _aux_cache due to
+ # _aux_cache_threshold.
+ return self._vardb.aux_get(cpv, attrs)
+ metadata = pkg_data[1]
+ return [metadata.get(k, "") for k in attrs]
diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
index e21135a..37504e8 100644
--- a/pym/portage/dbapi/vartree.py
+++ b/pym/portage/dbapi/vartree.py
@@ -422,6 +422,9 @@ class vardbapi(dbapi):
(generally this is only necessary in critical sections that
involve merge or unmerge of packages).
"""
+ return list(self._iter_cpv_all(use_cache=use_cache))
+
+ def _iter_cpv_all(self, use_cache = True, sort = False):
returnme = []
basepath = os.path.join(self._eroot, VDB_PATH) + os.path.sep
@@ -438,12 +441,21 @@ class vardbapi(dbapi):
del e
return []
- for x in listdir(basepath, EmptyOnError=1, ignorecvs=1, dirsonly=1):
+ catdirs = listdir(basepath, EmptyOnError=1, ignorecvs=1, dirsonly=1)
+ if sort:
+ catdirs.sort()
+
+ for x in catdirs:
if self._excluded_dirs.match(x) is not None:
continue
if not self._category_re.match(x):
continue
- for y in listdir(basepath + x, EmptyOnError=1, dirsonly=1):
+
+ pkgdirs = listdir(basepath + x, EmptyOnError=1, dirsonly=1)
+ if sort:
+ pkgdirs.sort()
+
+ for y in pkgdirs:
if self._excluded_dirs.match(y) is not None:
continue
subpath = x + "/" + y
@@ -455,9 +467,8 @@ class vardbapi(dbapi):
except InvalidData:
self.invalidentry(self.getpath(subpath))
continue
- returnme.append(subpath)
- return returnme
+ yield subpath
def cp_all(self, use_cache=1):
mylist = self.cpv_all(use_cache=use_cache)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option.
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
` (3 preceding siblings ...)
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class Zac Medico
@ 2014-11-01 22:46 ` Zac Medico
2014-11-01 23:04 ` Zac Medico
2014-11-04 22:09 ` [gentoo-portage-dev] [PATCH 5/5 v4] " Zac Medico
2014-11-03 21:42 ` [gentoo-portage-dev] Brian Dolbec
5 siblings, 2 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-01 22:46 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The new emerge --search-index option, which is enabled by default,
causes pkg_desc_index to be used for search optimization. The search
index needs to be regenerated by egencache after changes are made to
a repository (see the --update-pkg-desc-index action).
For users that would like to modify ebuilds in a repository without
running egencache afterwards, emerge --search-index=n can be used to
get non-indexed search. Alternatively, the user could simply remove
the stale index file, in order to disable the search index for a
particular repository.
In order to conserve memory, indices are read as streams, and
MultiIterGroupBy is used to group results from IndexedPortdb and
IndexedVardb. Stream-oriented search also makes it possible to
display search results incrementally (fixing bug #412471).
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
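To illustrate the intended workflow with plain commands (the repository
name is just an example):

# regenerate the index after editing ebuilds in a repository
egencache --repo gentoo --update-pkg-desc-index

# indexed search (the default), and explicitly non-indexed search
emerge --search foo
emerge --search-index=n --search foo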
man/emerge.1 | 8 ++++
pym/_emerge/actions.py | 3 +-
pym/_emerge/depgraph.py | 2 +-
pym/_emerge/main.py | 5 +++
pym/_emerge/search.py | 112 ++++++++++++++++++++++++++++++++++--------------
5 files changed, 95 insertions(+), 35 deletions(-)
diff --git a/man/emerge.1 b/man/emerge.1
index bbe71ac..7bcdd9a 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -796,6 +796,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 48b0826..8a22ab5 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/depgraph.py b/pym/_emerge/depgraph.py
index 78b9236..2fbb7ce 100644
--- a/pym/_emerge/depgraph.py
+++ b/pym/_emerge/depgraph.py
@@ -8596,7 +8596,7 @@ def ambiguous_package_name(arg, atoms, root_config, spinner, myopts):
s = search(root_config, spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts, search_index = False)
null_cp = portage.dep_getkey(insert_category_into_atom(
arg, "null"))
cat, atom_pn = portage.catsplit(null_cp)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index cf7966c..c08e12a 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..acde3bd 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -7,9 +7,12 @@ import re
import portage
from portage import os
from portage.dbapi.porttree import _parse_uri_map
+from portage.dbapi.IndexedPortdb import IndexedPortdb
+from portage.dbapi.IndexedVardb import IndexedVardb
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
from _emerge.Package import Package
@@ -25,15 +28,17 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index = True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
- self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
+ self.searchkey = None
+ # Disable the spinner since search results are displayed
+ # incrementally.
+ self.spinner = None
self.root_config = root_config
self.setconfig = root_config.setconfig
self.matches = {"pkg" : []}
@@ -45,6 +50,10 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ if search_index:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,16 +62,23 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
self.spinner.update()
def _cp_all(self):
- cp_all = set()
+ iterators = []
for db in self._dbs:
- cp_all.update(db.cp_all())
- return list(sorted(cp_all))
+ i = db.cp_all()
+ try:
+ i = iter(i)
+ except TypeError:
+ pass
+ iterators.append(i)
+ for group in MultiIterGroupBy(iterators):
+ yield group[0]
def _aux_get(self, *args, **kwargs):
for db in self._dbs:
@@ -97,7 +113,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -171,8 +187,11 @@ class search(object):
def execute(self,searchkey):
"""Performs the search for the supplied search key"""
+ self.searchkey = searchkey
+
+ def _iter_search(self):
+
match_category = 0
- self.searchkey=searchkey
self.packagematches = []
if self.searchdesc:
self.searchdesc=1
@@ -181,6 +200,7 @@ class search(object):
self.searchdesc=0
self.matches = {"pkg":[], "set":[]}
print("Searching... ", end=' ')
+ print()
regexsearch = False
if self.searchkey.startswith('%'):
@@ -206,8 +226,24 @@ class search(object):
if self.searchre.search(match_string):
if not self._xmatch("match-visible", package):
masked=1
- self.matches["pkg"].append([package,masked])
+ yield ("pkg", package, masked)
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = self._xmatch("match-all", package)
+ if not full_package:
+ continue
+ full_package = full_package[-1]
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +253,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ yield ("desc", full_package, masked)
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -235,16 +265,11 @@ class search(object):
match_string = setname.split("/")[-1]
if self.searchre.search(match_string):
- self.matches["set"].append([setname, False])
+ yield ("set", setname, False)
elif self.searchdesc:
if self.searchre.search(
self.sdict[setname].getMetadata("DESCRIPTION")):
- self.matches["set"].append([setname, False])
-
- self.mlen=0
- for mtype in self.matches:
- self.matches[mtype].sort()
- self.mlen += len(self.matches[mtype])
+ yield ("set", setname, False)
def addCP(self, cp):
if not self._xmatch("match-all", cp):
@@ -257,17 +282,32 @@ class search(object):
def output(self):
"""Outputs the results of the search."""
- msg = []
+
+ class msg(object):
+ @staticmethod
+ def append(msg):
+ writemsg_stdout(msg, noiselevel=-1)
+
msg.append("\b\b \n[ Results for search key : " + \
bold(self.searchkey) + " ]\n")
- msg.append("[ Applications found : " + \
- bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
- for mtype in self.matches:
- for match,masked in self.matches[mtype]:
+
+ if self.searchkey is None:
+ # Handle results added via addCP
+ addCP_matches = []
+ for mytype, matches in self.matches.items():
+ for (match, masked) in matches:
+ addCP_matches.append((mytype, match, masked))
+ iterator = iter(addCP_matches)
+
+ else:
+ # Do a normal search
+ iterator = self._iter_search()
+
+ for mtype, match, masked in iterator:
+ self.mlen += 1
full_package = None
if mtype == "pkg":
full_package = self._xmatch(
@@ -367,12 +407,19 @@ class search(object):
+ " " + desc + "\n")
msg.append(" " + darkgreen("License:") + \
" " + license + "\n\n")
- writemsg_stdout(''.join(msg), noiselevel=-1)
+
+ msg.append("[ Applications found : " + \
+ bold(str(self.mlen)) + " ]\n\n")
+
#
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -391,4 +438,3 @@ class search(object):
else:
result = ""
return result
-
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
@ 2014-11-01 23:04 ` Zac Medico
2014-11-04 5:42 ` [gentoo-portage-dev] [PATCH 5/5 v3] " Zac Medico
2014-11-04 22:09 ` [gentoo-portage-dev] [PATCH 5/5 v4] " Zac Medico
1 sibling, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-01 23:04 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The new emerge --search-index option, which is enabled by default,
causes pkg_desc_index to be used for search optimization. The search
index needs to be regenerated by egencache after changes are made to
a repository (see the --update-pkg-desc-index action).
For users that would like to modify ebuilds in a repository without
running egencache afterwards, emerge --search-index=n can be used to
get non-indexed search. Alternatively, the user could simply remove
the stale index file, in order to disable the search index for a
particular repository.
In order to conserve memory, indices are read as streams, and
MultiIterGroupBy is used to group results from IndexedPortdb and
IndexedVardb. Stream-oriented search also makes it possible to
display search results incrementally (fixing bug #412471).
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch fixes the search.output method to reset the match
count after each search is performed, in case there are multiple
searches.
man/emerge.1 | 8 ++++
pym/_emerge/actions.py | 3 +-
pym/_emerge/depgraph.py | 2 +-
pym/_emerge/main.py | 5 ++
pym/_emerge/search.py | 119 ++++++++++++++++++++++++++++++++++--------------
5 files changed, 102 insertions(+), 35 deletions(-)
diff --git a/man/emerge.1 b/man/emerge.1
index bbe71ac..7bcdd9a 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -796,6 +796,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 48b0826..8a22ab5 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/depgraph.py b/pym/_emerge/depgraph.py
index 78b9236..2fbb7ce 100644
--- a/pym/_emerge/depgraph.py
+++ b/pym/_emerge/depgraph.py
@@ -8596,7 +8596,7 @@ def ambiguous_package_name(arg, atoms, root_config, spinner, myopts):
s = search(root_config, spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts, search_index = False)
null_cp = portage.dep_getkey(insert_category_into_atom(
arg, "null"))
cat, atom_pn = portage.catsplit(null_cp)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index cf7966c..c08e12a 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..1d710ee 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -7,9 +7,12 @@ import re
import portage
from portage import os
from portage.dbapi.porttree import _parse_uri_map
+from portage.dbapi.IndexedPortdb import IndexedPortdb
+from portage.dbapi.IndexedVardb import IndexedVardb
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
from _emerge.Package import Package
@@ -25,15 +28,17 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index = True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
- self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
+ self.searchkey = None
+ # Disable the spinner since search results are displayed
+ # incrementally.
+ self.spinner = None
self.root_config = root_config
self.setconfig = root_config.setconfig
self.matches = {"pkg" : []}
@@ -45,6 +50,10 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ if search_index:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,16 +62,23 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
self.spinner.update()
def _cp_all(self):
- cp_all = set()
+ iterators = []
for db in self._dbs:
- cp_all.update(db.cp_all())
- return list(sorted(cp_all))
+ i = db.cp_all()
+ try:
+ i = iter(i)
+ except TypeError:
+ pass
+ iterators.append(i)
+ for group in MultiIterGroupBy(iterators):
+ yield group[0]
def _aux_get(self, *args, **kwargs):
for db in self._dbs:
@@ -97,7 +113,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -171,8 +187,11 @@ class search(object):
def execute(self,searchkey):
"""Performs the search for the supplied search key"""
+ self.searchkey = searchkey
+
+ def _iter_search(self):
+
match_category = 0
- self.searchkey=searchkey
self.packagematches = []
if self.searchdesc:
self.searchdesc=1
@@ -181,6 +200,7 @@ class search(object):
self.searchdesc=0
self.matches = {"pkg":[], "set":[]}
print("Searching... ", end=' ')
+ print()
regexsearch = False
if self.searchkey.startswith('%'):
@@ -206,8 +226,24 @@ class search(object):
if self.searchre.search(match_string):
if not self._xmatch("match-visible", package):
masked=1
- self.matches["pkg"].append([package,masked])
+ yield ("pkg", package, masked)
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = self._xmatch("match-all", package)
+ if not full_package:
+ continue
+ full_package = full_package[-1]
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +253,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ yield ("desc", full_package, masked)
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -235,16 +265,11 @@ class search(object):
match_string = setname.split("/")[-1]
if self.searchre.search(match_string):
- self.matches["set"].append([setname, False])
+ yield ("set", setname, False)
elif self.searchdesc:
if self.searchre.search(
self.sdict[setname].getMetadata("DESCRIPTION")):
- self.matches["set"].append([setname, False])
-
- self.mlen=0
- for mtype in self.matches:
- self.matches[mtype].sort()
- self.mlen += len(self.matches[mtype])
+ yield ("set", setname, False)
def addCP(self, cp):
if not self._xmatch("match-all", cp):
@@ -257,17 +282,32 @@ class search(object):
def output(self):
"""Outputs the results of the search."""
- msg = []
+
+ class msg(object):
+ @staticmethod
+ def append(msg):
+ writemsg_stdout(msg, noiselevel=-1)
+
msg.append("\b\b \n[ Results for search key : " + \
bold(self.searchkey) + " ]\n")
- msg.append("[ Applications found : " + \
- bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
- for mtype in self.matches:
- for match,masked in self.matches[mtype]:
+
+ if self.searchkey is None:
+ # Handle results added via addCP
+ addCP_matches = []
+ for mytype, matches in self.matches.items():
+ for (match, masked) in matches:
+ addCP_matches.append((mytype, match, masked))
+ iterator = iter(addCP_matches)
+
+ else:
+ # Do a normal search
+ iterator = self._iter_search()
+
+ for mtype, match, masked in iterator:
+ self.mlen += 1
full_package = None
if mtype == "pkg":
full_package = self._xmatch(
@@ -367,12 +407,26 @@ class search(object):
+ " " + desc + "\n")
msg.append(" " + darkgreen("License:") + \
" " + license + "\n\n")
- writemsg_stdout(''.join(msg), noiselevel=-1)
+
+ msg.append("[ Applications found : " + \
+ bold(str(self.mlen)) + " ]\n\n")
+
+ # This method can be called multiple times, so
+ # reset the match count for the next call. Don't
+ # reset it at the beginning of this method, since
+ # that would lose modifications from the addCP
+ # method.
+ self.mlen = 0
+
#
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -391,4 +445,3 @@ class search(object):
else:
result = ""
return result
-
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
@ 2014-11-02 0:18 ` Zac Medico
2014-11-02 22:50 ` [gentoo-portage-dev] [PATCH 2/5 v3] " Zac Medico
2014-11-03 3:07 ` [gentoo-portage-dev] [PATCH 2/5 v4] " Zac Medico
2 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-02 0:18 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This IndexStreamIterator class can be used together with the
pkg_desc_index_line_read function to read an index file incrementally
as a stream.
The MultiIterGroupBy class can be used to iterate over multiple
IndexStreamIterator instances at once, incrementally grouping results
for a particular package from multiple indices, while limiting the
amount of any given index that must be in memory at once.
Both of these classes are used by the IndexedPortdb class in the next
patch of this series.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch includes a logic fix to ensure that all buffered
objects are yielded when all of the iterators are exhausted.
pym/portage/cache/index/IndexStreamIterator.py | 27 +++++++++
pym/portage/util/iterators/MultiIterGroupBy.py | 82 ++++++++++++++++++++++++++
pym/portage/util/iterators/__init__.py | 2 +
3 files changed, 111 insertions(+)
create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
create mode 100644 pym/portage/util/iterators/__init__.py
diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
new file mode 100644
index 0000000..972aee1
--- /dev/null
+++ b/pym/portage/cache/index/IndexStreamIterator.py
@@ -0,0 +1,27 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class IndexStreamIterator(object):
+
+ def __init__(self, f, parser):
+
+ self.parser = parser
+ self._file = f
+
+ def close(self):
+
+ if self._file is not None:
+ self._file.close()
+ self._file = None
+
+ def __iter__(self):
+
+ try:
+
+ for line in self._file:
+ node = self.parser(line)
+ if node is not None:
+ yield node
+
+ finally:
+ self.close()
diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
new file mode 100644
index 0000000..ece7a4c
--- /dev/null
+++ b/pym/portage/util/iterators/MultiIterGroupBy.py
@@ -0,0 +1,82 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class MultiIterGroupBy(object):
+ """
+ This class functions similarly to the itertools.groupby function,
+ except that it takes multiple source iterators as input. The source
+ iterators must yield objects in sorted order. A group is yielded as
+ soon as the progress of all iterators reaches a state which
+ guarantees that there can not be any remaining (unseen) elements of
+ the group. This is useful for incremental display of grouped search
+ results.
+ """
+
+ def __init__(self, iterators, key = None):
+ self._iterators = iterators
+ self._key = key
+
+ def __iter__(self):
+
+ progress = []
+ iterators = self._iterators[:]
+ for index in iterators:
+ progress.append(None)
+
+ key_map = {}
+ eof = []
+ key_getter = self._key
+ if key_getter is None:
+ key_getter = lambda x: x
+ max_progress = None
+
+ while iterators:
+ min_progress = None
+ for i, index in enumerate(iterators):
+
+ if max_progress is not None and \
+ max_progress == progress[i] and \
+ min_progress is not None and \
+ max_progress != min_progress:
+ # This one has the most progress,
+ # so allow the others to catch up.
+ continue
+
+ for entry in index:
+ progress[i] = key_getter(entry)
+ key_group = key_map.get(key_getter(entry))
+ if key_group is None:
+ key_group = []
+ key_map[key_getter(entry)] = key_group
+
+ key_group.append(entry)
+
+ if min_progress is None or \
+ key_getter(entry) < min_progress:
+ min_progress = key_getter(entry)
+
+ if max_progress is None or \
+ key_getter(entry) >= max_progress:
+ max_progress = key_getter(entry)
+ # This one has the most progress,
+ # so allow the others to catch up.
+ break
+
+ else:
+ eof.append(i)
+
+ if eof:
+ for i in reversed(eof):
+ del iterators[i]
+ del progress[i]
+ del eof[:]
+
+ yield_these = []
+ for k in key_map:
+ if not iterators or k <= min_progress:
+ yield_these.append(k)
+
+ if yield_these:
+ yield_these.sort()
+ for k in yield_these:
+ yield key_map.pop(k)
diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/util/iterators/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 2/5 v3] Add IndexStreamIterator and MultiIterGroupBy.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
2014-11-02 0:18 ` Zac Medico
@ 2014-11-02 22:50 ` Zac Medico
2014-11-03 3:07 ` [gentoo-portage-dev] [PATCH 2/5 v4] " Zac Medico
2 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-02 22:50 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This IndexStreamIterator class can be used together with the
pkg_desc_index_line_read function to read an index file incrementally
as a stream.
The MultiIterGroupBy class can be used to iterate over multiple
IndexStreamIterator instances at once, incrementally grouping results
for a particular package from multiple indices, while limiting the
amount of any given index that must be in memory at once.
Both of these classes are used by the IndexedPortdb class in the next
patch of this series.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch cleans up the logic (possibly fixing bugs), and
optimizes it to avoid over-buffering (memory waste). Also, I added
a TODO note to use a binary search tree to optimize the search
for completed groups.
pym/portage/cache/index/IndexStreamIterator.py | 27 ++++++++
pym/portage/util/iterators/MultiIterGroupBy.py | 94 ++++++++++++++++++++++++++
pym/portage/util/iterators/__init__.py | 2 +
3 files changed, 123 insertions(+)
create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
create mode 100644 pym/portage/util/iterators/__init__.py
diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
new file mode 100644
index 0000000..972aee1
--- /dev/null
+++ b/pym/portage/cache/index/IndexStreamIterator.py
@@ -0,0 +1,27 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class IndexStreamIterator(object):
+
+ def __init__(self, f, parser):
+
+ self.parser = parser
+ self._file = f
+
+ def close(self):
+
+ if self._file is not None:
+ self._file.close()
+ self._file = None
+
+ def __iter__(self):
+
+ try:
+
+ for line in self._file:
+ node = self.parser(line)
+ if node is not None:
+ yield node
+
+ finally:
+ self.close()
diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
new file mode 100644
index 0000000..2d8652e
--- /dev/null
+++ b/pym/portage/util/iterators/MultiIterGroupBy.py
@@ -0,0 +1,94 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class MultiIterGroupBy(object):
+ """
+ This class functions similarly to the itertools.groupby function,
+ except that it takes multiple source iterators as input. The source
+ iterators must yield objects in sorted order. A group is yielded as
+ soon as the progress of all iterators reaches a state which
+ guarantees that there can not be any remaining (unseen) elements of
+ the group. This is useful for incremental display of grouped search
+ results.
+ """
+
+ def __init__(self, iterators, key = None):
+ self._iterators = iterators
+ self._key = key
+
+ def __iter__(self):
+
+ trackers = []
+ for iterator in self._iterators:
+ trackers.append(_IteratorTracker(iterator))
+
+ key_map = {}
+ eof = []
+ key_getter = self._key
+ if key_getter is None:
+ key_getter = lambda x: x
+ min_progress = None
+
+ while trackers:
+
+ for tracker in trackers:
+
+ if tracker.current is not None and \
+ tracker.current != min_progress:
+ # The trackers are sorted by progress, so the
+ # remaining trackers are guaranteed to have
+ # sufficient progress.
+ continue
+
+ # In order to avoid over-buffering (waste of memory),
+ # only grab a single entry.
+ try:
+ entry = next(tracker.iterator)
+ except StopIteration:
+ eof.append(tracker)
+ else:
+ tracker.current = key_getter(entry)
+ key_group = key_map.get(tracker.current)
+ if key_group is None:
+ key_group = []
+ key_map[tracker.current] = key_group
+ key_group.append(entry)
+
+ if eof:
+ for tracker in eof:
+ trackers.remove(tracker)
+ del eof[:]
+
+ if trackers:
+ trackers.sort()
+ min_progress = trackers[0].current
+ yield_these = []
+ # TODO: Use a binary search tree to optimize this loop.
+ for k in key_map:
+ if k <= min_progress:
+ yield_these.append(k)
+ else:
+ yield_these = list(key_map)
+
+ if yield_these:
+ yield_these.sort()
+ for k in yield_these:
+ yield key_map.pop(k)
+
+class _IteratorTracker(object):
+
+ __slots__ = ('current', 'iterator')
+
+ def __init__(self, iterator):
+
+ self.iterator = iterator
+ self.current = None
+
+ def __lt__(self, other):
+ if self.current is None:
+ if other.current is None:
+ return False
+ else:
+ return True
+ return other.current is not None and \
+ self.current < other.current
diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/util/iterators/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 2/5 v4] Add IndexStreamIterator and MultiIterGroupBy.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
2014-11-02 0:18 ` Zac Medico
2014-11-02 22:50 ` [gentoo-portage-dev] [PATCH 2/5 v3] " Zac Medico
@ 2014-11-03 3:07 ` Zac Medico
2 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-03 3:07 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The IndexStreamIterator class can be used together with the
pkg_desc_index_line_read function to read an index file incrementally
as a stream.
The MultiIterGroupBy class can be used to iterate over multiple
IndexStreamIterator instances at once, incrementally grouping results
for a particular package from multiple indices, while limiting the
amount of any given index that must be in memory at once.
Both of these classes are used by the IndexedPortdb class in the next
patch of this series.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch uses Python's bisect module to optimize the search for
completed groups.
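As an illustration (not part of the patch; the input data is invented), here
is a minimal usage sketch showing how MultiIterGroupBy groups equal entries
drawn from several sorted iterators:

    from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy

    # Two already-sorted streams of package names (illustrative data only).
    stream_a = iter(["app-arch/bzip2", "sys-apps/sed"])
    stream_b = iter(["app-arch/bzip2", "app-arch/tar", "sys-apps/sed"])

    for group in MultiIterGroupBy([stream_a, stream_b]):
        # Each group is a list of equal entries collected from all streams,
        # yielded as soon as no stream can still produce a member of it:
        # ['app-arch/bzip2', 'app-arch/bzip2'], then ['app-arch/tar'],
        # then ['sys-apps/sed', 'sys-apps/sed'].
        print(group)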
pym/portage/cache/index/IndexStreamIterator.py | 27 +++++++
pym/portage/util/iterators/MultiIterGroupBy.py | 97 ++++++++++++++++++++++++++
pym/portage/util/iterators/__init__.py | 2 +
3 files changed, 126 insertions(+)
create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
create mode 100644 pym/portage/util/iterators/__init__.py
diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
new file mode 100644
index 0000000..972aee1
--- /dev/null
+++ b/pym/portage/cache/index/IndexStreamIterator.py
@@ -0,0 +1,27 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+class IndexStreamIterator(object):
+
+ def __init__(self, f, parser):
+
+ self.parser = parser
+ self._file = f
+
+ def close(self):
+
+ if self._file is not None:
+ self._file.close()
+ self._file = None
+
+ def __iter__(self):
+
+ try:
+
+ for line in self._file:
+ node = self.parser(line)
+ if node is not None:
+ yield node
+
+ finally:
+ self.close()
diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
new file mode 100644
index 0000000..f5f8278
--- /dev/null
+++ b/pym/portage/util/iterators/MultiIterGroupBy.py
@@ -0,0 +1,97 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import bisect
+
+class MultiIterGroupBy(object):
+ """
+ This class functions similarly to the itertools.groupby function,
+ except that it takes multiple source iterators as input. The source
+ iterators must yield objects in sorted order. A group is yielded as
+ soon as the progress of all iterators reaches a state which
+ guarantees that there can not be any remaining (unseen) elements of
+ the group. This is useful for incremental display of grouped search
+ results.
+ """
+
+ def __init__(self, iterators, key = None):
+ self._iterators = iterators
+ self._key = key
+
+ def __iter__(self):
+
+ trackers = []
+ for iterator in self._iterators:
+ trackers.append(_IteratorTracker(iterator))
+
+ key_map = {}
+ key_list = []
+ eof = []
+ key_getter = self._key
+ if key_getter is None:
+ key_getter = lambda x: x
+ min_progress = None
+
+ while trackers:
+
+ for tracker in trackers:
+
+ if tracker.current is not None and \
+ tracker.current != min_progress:
+ # The trackers are sorted by progress, so the
+ # remaining trackers are guaranteed to have
+ # sufficient progress.
+ break
+
+ # In order to avoid over-buffering (waste of memory),
+ # only grab a single entry.
+ try:
+ entry = next(tracker.iterator)
+ except StopIteration:
+ eof.append(tracker)
+ else:
+ tracker.current = key_getter(entry)
+ key_group = key_map.get(tracker.current)
+ if key_group is None:
+ key_group = []
+ key_map[tracker.current] = key_group
+ bisect.insort(key_list, tracker.current)
+ key_group.append(entry)
+
+ if eof:
+ for tracker in eof:
+ trackers.remove(tracker)
+ del eof[:]
+
+ if trackers:
+ trackers.sort()
+ min_progress = trackers[0].current
+ # yield if key <= min_progress
+ i = bisect.bisect_right(key_list, min_progress)
+ yield_these = key_list[:i]
+ del key_list[:i]
+ else:
+ yield_these = key_list
+ key_list = []
+
+ if yield_these:
+ for k in yield_these:
+ yield key_map.pop(k)
+
+class _IteratorTracker(object):
+
+ __slots__ = ('current', 'iterator')
+
+ def __init__(self, iterator):
+
+ self.iterator = iterator
+ self.current = None
+
+ def __lt__(self, other):
+ if self.current is None:
+ if other.current is None:
+ return False
+ else:
+ return True
+ return other.current is not None and \
+ self.current < other.current
diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/util/iterators/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [gentoo-portage-dev]
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
` (4 preceding siblings ...)
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
@ 2014-11-03 21:42 ` Brian Dolbec
2014-11-04 9:19 ` [gentoo-portage-dev] Zac Medico
5 siblings, 1 reply; 29+ messages in thread
From: Brian Dolbec @ 2014-11-03 21:42 UTC (permalink / raw
To: gentoo-portage-dev
On Sat, 1 Nov 2014 15:46:18 -0700
Zac Medico <zmedico@gentoo.org> wrote:
> In addition to indexed search, this patch series implements
> stream-based incremental display of search results (fixing bug
> #412471).
>
> I'll be maintaining this patch series in the following branch:
>
> https://github.com/zmedico/portage/tree/bug_525718_stream
>
> The old non-incremental version of this patch series is still
> available here:
>
> https://github.com/zmedico/portage/tree/bug_525718
>
>
Zac, while the code looks good, testing has shown a significant drop
in performance for a -s search, while it greatly improves for a -S
search.
big_daddy portage # time emerge -s porthole
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
[ Results for search key : porthole ]
Searching...
* app-portage/porthole
Latest version available: 0.6.1-r3
Latest version installed: 0.6.1-r3
Size of files: 937 KiB
Homepage: http://porthole.sourceforge.net
Description: A GTK+-based frontend to Portage
License: GPL-2
[ Applications found : 1 ]
real 0m5.509s
user 0m5.181s
sys 0m0.321s
big_daddy portage # time /usr/bin/emerge -s porthole
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
Searching...
[ Results for search key : porthole ]
[ Applications found : 1 ]
* app-portage/porthole
Latest version available: 0.6.1-r3
Latest version installed: 0.6.1-r3
Size of files: 937 KiB
Homepage: http://porthole.sourceforge.net
Description: A GTK+-based frontend to Portage
License: GPL-2
real 0m3.431s
user 0m3.024s
sys 0m0.401s
big_daddy portage # emerge --version
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
Portage 2.2.14_p28 (python 3.3.5-final-0, default/linux/amd64/13.0/desktop, gcc-4.8.3, glibc-2.19-r1, 3.16.1-gentoo x86_64)
big_daddy portage # /usr/bin/emerge --version
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
Portage 2.2.14 (python 3.3.5-final-0, default/linux/amd64/13.0/desktop, gcc-4.8.3, glibc-2.19-r1, 3.16.1-gentoo x86_64)
big_daddy portage # time esearch porthole
[ Results for search key : porthole ]
[ Applications found : 1 ]
* app-portage/porthole
Latest version available: 0.6.1-r3
Latest version installed: 0.6.1-r3
Size of downloaded files: 936 kB
Homepage: http://porthole.sourceforge.net
Description: A GTK+-based frontend to Portage
License: GPL-2
real 0m0.189s
user 0m0.149s
sys 0m0.038s
big_daddy portage #
========================
times for -S search
========================
big_daddy portage # time /usr/bin/emerge -S porthole
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
Searching... / * ERROR: app-portage/c-layman-9999::gentoo-guis failed (depend phase):
* git.eclass could not be found by inherit()
*
* Call stack:
* ebuild.sh, line 550: Called source '/home/brian/Dev/git/gentoo-guis/app-portage/c-layman/c-layman-9999.ebuild'
* c-layman-9999.ebuild, line 9: Called inherit 'git'
* ebuild.sh, line 257: Called die
* The specific snippet of code:
* [[ -z ${location} ]] && die "${1}.eclass could not be found by inherit()"
*
* If you need support, post the output of `emerge --info '=app-portage/c-layman-9999::gentoo-guis'`,
* the complete build log and the output of `emerge -pqv '=app-portage/c-layman-9999::gentoo-guis'`.
* Working directory: '/usr/lib64/python3.3/site-packages'
* S: '/var/tmp/portage/app-portage/c-layman-9999/work/c-layman-9999'
emerge: search: aux_get() failed, skipping
| * Manifest not found for '/home/brian/Dev/git/gentoo-guis/app-portage/ufed/ufed-9999.ebuild'
| * ERROR: dev-vcs/git-bzr-9999::The-Pit failed (depend phase):
* git.eclass could not be found by inherit()
*
* Call stack:
* ebuild.sh, line 550: Called source '/usr/local/portage/dev-vcs/git-bzr/git-bzr-9999.ebuild'
* git-bzr-9999.ebuild, line 4: Called inherit 'git'
* ebuild.sh, line 257: Called die
* The specific snippet of code:
* [[ -z ${location} ]] && die "${1}.eclass could not be found by inherit()"
*
* If you need support, post the output of `emerge --info '=dev-vcs/git-bzr-9999::The-Pit'`,
* the complete build log and the output of `emerge -pqv '=dev-vcs/git-bzr-9999::The-Pit'`.
* Working directory: '/usr/lib64/python3.3/site-packages'
* S: '/var/tmp/portage/dev-vcs/git-bzr-9999/work/git-bzr-9999'
emerge: search: aux_get() failed, skipping
[ Results for search key : porthole ]
[ Applications found : 1 ]
* app-portage/porthole
Latest version available: 0.6.1-r3
Latest version installed: 0.6.1-r3
Size of files: 937 KiB
Homepage: http://porthole.sourceforge.net
Description: A GTK+-based frontend to Portage
License: GPL-2
real 1m15.121s
user 1m1.664s
sys 0m12.891s
big_daddy portage # time emerge -S porthole
!!! Repository 'vdr-devel' has sync-type attribute set to unsupported value: 'layman'
!!! Repository 'wtk' has sync-type attribute set to unsupported value: 'layman'
[ Results for search key : porthole ]
Searching...
* ERROR: app-portage/c-layman-9999::gentoo-guis failed (depend phase):
* git.eclass could not be found by inherit()
*
* Call stack:
* ebuild.sh, line 575: Called source '/home/brian/Dev/git/gentoo-guis/app-portage/c-layman/c-layman-9999.ebuild'
* c-layman-9999.ebuild, line 9: Called inherit 'git'
* ebuild.sh, line 257: Called die
* The specific snippet of code:
* [[ -z ${location} ]] && die "${1}.eclass could not be found by inherit()"
*
* If you need support, post the output of `emerge --info '=app-portage/c-layman-9999::gentoo-guis'`,
* the complete build log and the output of `emerge -pqv '=app-portage/c-layman-9999::gentoo-guis'`.
* Working directory: '/home/brian/Dev/git/portage/pym'
* S: '/var/tmp/portage/app-portage/c-layman-9999/work/c-layman-9999'
emerge: search: aux_get() failed, skipping
* app-portage/porthole
Latest version available: 0.6.1-r3
Latest version installed: 0.6.1-r3
Size of files: 937 KiB
Homepage: http://porthole.sourceforge.net
Description: A GTK+-based frontend to Portage
License: GPL-2
* ERROR: dev-vcs/git-bzr-9999::The-Pit failed (depend phase):
* git.eclass could not be found by inherit()
*
* Call stack:
* ebuild.sh, line 575: Called source '/usr/local/portage/dev-vcs/git-bzr/git-bzr-9999.ebuild'
* git-bzr-9999.ebuild, line 4: Called inherit 'git'
* ebuild.sh, line 257: Called die
* The specific snippet of code:
* [[ -z ${location} ]] && die "${1}.eclass could not be found by inherit()"
*
* If you need support, post the output of `emerge --info '=dev-vcs/git-bzr-9999::The-Pit'`,
* the complete build log and the output of `emerge -pqv '=dev-vcs/git-bzr-9999::The-Pit'`.
* Working directory: '/home/brian/Dev/git/portage/pym'
* S: '/var/tmp/portage/dev-vcs/git-bzr-9999/work/git-bzr-9999'
emerge: search: aux_get() failed, skipping
[ Applications found : 1 ]
real 0m12.515s
user 0m11.795s
sys 0m0.660s
big_daddy portage #
I know the above times are probably slowed by not having overlays indexed, but it is still a significant speedup.
esearch is only marginally slower for a -S search than for a regular package-name search, typically just over 0.2 seconds.
The above was done with only the gentoo repo indexed, plus I need to clean out some old ebuilds in overlays.
But I was shocked to see the normal -s searches going from 3.4s to 5.5s with your new index.
Also, I did not see a way to request that all repos have their indexes updated; I believe that is a requirement of this new system. Re-running it for each installed repo individually is something best reserved for the new postsync() to do when it lands in master.
--
Brian Dolbec <dolsen>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 3/5 v2] Add IndexedPortdb class.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class Zac Medico
@ 2014-11-04 5:07 ` Zac Medico
2014-11-04 20:34 ` [gentoo-portage-dev] [PATCH 3/5 v3] " Zac Medico
0 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-04 5:07 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The IndexedPortdb class uses pkg_desc_index to optimize searches for
package names and descriptions. If the package description index is
missing from a particular repository, then all metadata for that
repository is obtained using the normal portdbapi.aux_get method.
This class only implements a subset of portdbapi functionality that is
useful for searching pkg_desc_index incrementally. For this reason,
the cp_all method returns an ordered iterator instead of a list, so
that search results can be displayed incrementally.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch has some optimizations in _init_index that should improve
performance for cases where some repositories are not indexed.
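As a rough usage sketch (not part of the patch; it assumes a standard
portage configuration and uses only the API shown in the diff below):

    import portage
    from portage.dbapi.IndexedPortdb import IndexedPortdb

    portdb = portage.db[portage.root]["porttree"].dbapi
    indexed = IndexedPortdb(portdb)

    # cp_all() returns an ordered iterator, so results can be consumed
    # (and displayed) incrementally instead of waiting for a full list.
    for cp in indexed.cp_all():
        cpv_list = indexed.match(cp)  # name/version constraints only
        if cpv_list:
            # DESCRIPTION lookups are served from the index when available.
            desc, = indexed.aux_get(cpv_list[-1], ["DESCRIPTION"])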
pym/portage/dbapi/IndexedPortdb.py | 153 +++++++++++++++++++++++++++++++++++++
1 file changed, 153 insertions(+)
create mode 100644 pym/portage/dbapi/IndexedPortdb.py
diff --git a/pym/portage/dbapi/IndexedPortdb.py b/pym/portage/dbapi/IndexedPortdb.py
new file mode 100644
index 0000000..e95ff4b
--- /dev/null
+++ b/pym/portage/dbapi/IndexedPortdb.py
@@ -0,0 +1,153 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import errno
+import io
+import functools
+import operator
+import os
+
+import portage
+from portage import _encodings
+from portage.dep import Atom
+from portage.exception import FileNotFound
+from portage.cache.index.IndexStreamIterator import IndexStreamIterator
+from portage.cache.index.pkg_desc_index import \
+ pkg_desc_index_line_read, pkg_desc_index_node
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
+from portage.versions import _pkg_str
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+
+ For performance reasons, the match method only supports package
+ name and version constraints. For the same reason, the xmatch
+ method is not implemented.
+ """
+
+ _copy_attrs = ('cpv_exists', 'findname', 'getFetchMap',
+ '_aux_cache_keys', '_cpv_sort_ascending',
+ '_have_root_eclass_dir')
+
+ def __init__(self, portdb):
+
+ self._portdb = portdb
+
+ for k in self._copy_attrs:
+ setattr(self, k, getattr(portdb, k))
+
+ self._desc_cache = None
+ self._cp_map = None
+
+ def _init_index(self):
+
+ cp_map = {}
+ desc_cache = {}
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+ index_missing = []
+
+ streams = []
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ filenames = []
+ for parent_dir in (repo_path, outside_repo):
+ filenames.append(os.path.join(parent_dir,
+ "metadata", "pkg_desc_index"))
+
+ repo_name = self._portdb.getRepositoryName(repo_path)
+
+ try:
+ f = None
+ for filename in filenames:
+ try:
+ f = io.open(filename,
+ encoding=_encodings["repo.content"])
+ except IOError as e:
+ if e.errno not in (errno.ENOENT, errno.ESTALE):
+ raise
+ else:
+ break
+
+ if f is None:
+ raise FileNotFound(filename)
+
+ streams.append(iter(IndexStreamIterator(f,
+ functools.partial(pkg_desc_index_line_read,
+ repo = repo_name))))
+ except FileNotFound:
+ index_missing.append(repo_path)
+
+ if index_missing:
+
+ class _NonIndexedStream(object):
+ def __iter__(self_):
+ for cp in self._portdb.cp_all(
+ trees = index_missing):
+ cp_list = self._portdb.cp_list(
+ cp, mytree = index_missing)
+ yield pkg_desc_index_node(cp,
+ tuple(_pkg_str(cpv) for cpv in cp_list),
+ None)
+
+ streams.append(iter(_NonIndexedStream()))
+
+ if streams:
+ if len(streams) == 1:
+ cp_group_iter = ([node] for node in streams[0])
+ else:
+ cp_group_iter = MultiIterGroupBy(streams,
+ key = operator.attrgetter("cp"))
+
+ for cp_group in cp_group_iter:
+
+ new_cp = None
+ cp_list = cp_map.get(cp_group[0].cp)
+ if cp_list is None:
+ new_cp = cp_group[0].cp
+ cp_list = []
+ cp_map[cp_group[0].cp] = cp_list
+
+ for entry in cp_group:
+ cp_list.extend(entry.cpv_list)
+ if entry.desc is not None:
+ for cpv in entry.cpv_list:
+ desc_cache[cpv] = entry.desc
+
+ if new_cp is not None:
+ yield cp_group[0].cp
+
+ def cp_all(self):
+ """
+ Returns an ordered iterator instead of a list, so that search
+ results can be displayed incrementally.
+ """
+ if self._cp_map is None:
+ return self._init_index()
+ return iter(sorted(self._cp_map))
+
+ def match(self, atom):
+ """
+ For performance reasons, only package name and version
+ constraints are supported.
+ """
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 5/5 v3] Add emerge --search-index option.
2014-11-01 23:04 ` Zac Medico
@ 2014-11-04 5:42 ` Zac Medico
2014-11-04 9:10 ` [gentoo-portage-dev] " Zac Medico
0 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-04 5:42 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The new emerge --search-index option, which is enabled by default,
causes pkg_desc_index to be used for search optimization. The search
index needs to be regenerated by egencache after changes are made to
a repository (see the --update-pkg-desc-index action).
For users that would like to modify ebuilds in a repository without
running egencache afterwards, emerge --search-index=n can be used to
get non-indexed search. Alternatively, the user could simply remove
the stale index file, in order to disable the search index for a
particular repository.
In order to conserve memory, indices are read as streams, and
MultiIterGroupBy is used to group results from IndexedPortdb and
IndexedVardb. Stream-oriented search also makes it possible to
display search results incrementally (fixing bug #412471).
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch causes indexed search to be enabled only for
searchdesc, since indexed variants can actually be slower when
only package names need to be searched.
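For reference, a condensed sketch of how the option reaches the search class
(mirroring the actions.py hunk below; root_config, spinner, and myopts are
assumed to come from the surrounding action_search context, and "porthole"
is just an example key):

    from _emerge.search import search

    search_index = myopts.get("--search-index", "y") != "n"
    searchinstance = search(root_config, spinner,
        "--searchdesc" in myopts, "--quiet" not in myopts,
        "--usepkg" in myopts, "--usepkgonly" in myopts,
        search_index=search_index)
    searchinstance.execute("porthole")
    searchinstance.output()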
man/emerge.1 | 8 ++++
pym/_emerge/actions.py | 3 +-
pym/_emerge/depgraph.py | 2 +-
pym/_emerge/main.py | 5 ++
pym/_emerge/search.py | 122 +++++++++++++++++++++++++++++++++++-------------
5 files changed, 105 insertions(+), 35 deletions(-)
diff --git a/man/emerge.1 b/man/emerge.1
index bbe71ac..7bcdd9a 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -796,6 +796,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 48b0826..8a22ab5 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/depgraph.py b/pym/_emerge/depgraph.py
index 94eaed8..da408ad 100644
--- a/pym/_emerge/depgraph.py
+++ b/pym/_emerge/depgraph.py
@@ -8656,7 +8656,7 @@ def ambiguous_package_name(arg, atoms, root_config, spinner, myopts):
s = search(root_config, spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts, search_index = False)
null_cp = portage.dep_getkey(insert_category_into_atom(
arg, "null"))
cat, atom_pn = portage.catsplit(null_cp)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index cf7966c..c08e12a 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..5821c37 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -7,9 +7,12 @@ import re
import portage
from portage import os
from portage.dbapi.porttree import _parse_uri_map
+from portage.dbapi.IndexedPortdb import IndexedPortdb
+from portage.dbapi.IndexedVardb import IndexedVardb
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
from _emerge.Package import Package
@@ -25,15 +28,17 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index = True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
- self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
+ self.searchkey = None
+ # Disable the spinner since search results are displayed
+ # incrementally.
+ self.spinner = None
self.root_config = root_config
self.setconfig = root_config.setconfig
self.matches = {"pkg" : []}
@@ -45,6 +50,13 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ # The indexed variants can actually be slower when only
+ # package names need to be searched, so only use indices
+ # for searchdesc.
+ if search_index and searchdesc:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,16 +65,23 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
self.spinner.update()
def _cp_all(self):
- cp_all = set()
+ iterators = []
for db in self._dbs:
- cp_all.update(db.cp_all())
- return list(sorted(cp_all))
+ i = db.cp_all()
+ try:
+ i = iter(i)
+ except TypeError:
+ pass
+ iterators.append(i)
+ for group in MultiIterGroupBy(iterators):
+ yield group[0]
def _aux_get(self, *args, **kwargs):
for db in self._dbs:
@@ -97,7 +116,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -171,8 +190,11 @@ class search(object):
def execute(self,searchkey):
"""Performs the search for the supplied search key"""
+ self.searchkey = searchkey
+
+ def _iter_search(self):
+
match_category = 0
- self.searchkey=searchkey
self.packagematches = []
if self.searchdesc:
self.searchdesc=1
@@ -181,6 +203,7 @@ class search(object):
self.searchdesc=0
self.matches = {"pkg":[], "set":[]}
print("Searching... ", end=' ')
+ print()
regexsearch = False
if self.searchkey.startswith('%'):
@@ -206,8 +229,24 @@ class search(object):
if self.searchre.search(match_string):
if not self._xmatch("match-visible", package):
masked=1
- self.matches["pkg"].append([package,masked])
+ yield ("pkg", package, masked)
elif self.searchdesc: # DESCRIPTION searching
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = self._xmatch("match-all", package)
+ if not full_package:
+ continue
+ full_package = full_package[-1]
+ try:
+ full_desc = self._aux_get(
+ full_package, ["DESCRIPTION"])[0]
+ except KeyError:
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
+ continue
+ if not self.searchre.search(full_desc):
+ continue
full_package = self._xmatch("bestmatch-visible", package)
if not full_package:
#no match found; we don't want to query description
@@ -217,14 +256,8 @@ class search(object):
continue
else:
masked=1
- try:
- full_desc = self._aux_get(
- full_package, ["DESCRIPTION"])[0]
- except KeyError:
- print("emerge: search: aux_get() failed, skipping")
- continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+
+ yield ("desc", full_package, masked)
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -235,16 +268,11 @@ class search(object):
match_string = setname.split("/")[-1]
if self.searchre.search(match_string):
- self.matches["set"].append([setname, False])
+ yield ("set", setname, False)
elif self.searchdesc:
if self.searchre.search(
self.sdict[setname].getMetadata("DESCRIPTION")):
- self.matches["set"].append([setname, False])
-
- self.mlen=0
- for mtype in self.matches:
- self.matches[mtype].sort()
- self.mlen += len(self.matches[mtype])
+ yield ("set", setname, False)
def addCP(self, cp):
if not self._xmatch("match-all", cp):
@@ -257,17 +285,32 @@ class search(object):
def output(self):
"""Outputs the results of the search."""
- msg = []
+
+ class msg(object):
+ @staticmethod
+ def append(msg):
+ writemsg_stdout(msg, noiselevel=-1)
+
msg.append("\b\b \n[ Results for search key : " + \
bold(self.searchkey) + " ]\n")
- msg.append("[ Applications found : " + \
- bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
- for mtype in self.matches:
- for match,masked in self.matches[mtype]:
+
+ if self.searchkey is None:
+ # Handle results added via addCP
+ addCP_matches = []
+ for mytype, (match, masked) in self.matches.items():
+ addCP_matches.append((mytype, match, masked))
+ iterator = iter(addCP_matches)
+
+ else:
+ # Do a normal search
+ iterator = self._iter_search()
+
+ for mtype, match, masked in iterator:
+ self.mlen += 1
full_package = None
if mtype == "pkg":
full_package = self._xmatch(
@@ -367,12 +410,26 @@ class search(object):
+ " " + desc + "\n")
msg.append(" " + darkgreen("License:") + \
" " + license + "\n\n")
- writemsg_stdout(''.join(msg), noiselevel=-1)
+
+ msg.append("[ Applications found : " + \
+ bold(str(self.mlen)) + " ]\n\n")
+
+ # This method can be called multiple times, so
+ # reset the match count for the next call. Don't
+ # reset it at the beginning of this method, since
+ # that would lose modifications from the addCP
+ # method.
+ self.mlen = 0
+
#
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -391,4 +448,3 @@ class search(object):
else:
result = ""
return result
-
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 1/5 v2] Add egencache --update-pkg-desc-index action.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
@ 2014-11-04 9:03 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-04 9:03 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch optimizes pkg_desc_index_line_read to skip package name
and version validation. This fixes a performance problem reported by
Brian Dolbec.
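To illustrate the index line format handled by the helpers added below
(a sketch only; the sed versions are invented for the example):

    from portage.cache.index.pkg_desc_index import (
        pkg_desc_index_line_format, pkg_desc_index_line_read)

    line = pkg_desc_index_line_format(
        "sys-apps/sed",
        ["sys-apps/sed-4.2.1", "sys-apps/sed-4.2.2"],
        "Super-useful stream editor")
    # line == "sys-apps/sed 4.2.1 4.2.2: Super-useful stream editor\n"

    node = pkg_desc_index_line_read(line, repo="gentoo")
    # node.cp == "sys-apps/sed"
    # node.cpv_list[-1] == "sys-apps/sed-4.2.2" (a pkg_node, unvalidated)
    # node.desc == "Super-useful stream editor"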
bin/egencache | 38 ++++++++++++++++++--
man/egencache.1 | 4 +++
man/portage.5 | 12 +++++++
pym/portage/cache/index/__init__.py | 2 ++
pym/portage/cache/index/pkg_desc_index.py | 59 +++++++++++++++++++++++++++++++
5 files changed, 113 insertions(+), 2 deletions(-)
create mode 100644 pym/portage/cache/index/__init__.py
create mode 100644 pym/portage/cache/index/pkg_desc_index.py
diff --git a/bin/egencache b/bin/egencache
index e366058..f97432f 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -48,6 +48,7 @@ portage._internal_caller = True
from portage import os, _encodings, _unicode_encode, _unicode_decode
from _emerge.MetadataRegen import MetadataRegen
from portage.cache.cache_errors import CacheError, StatCollision
+from portage.cache.index.pkg_desc_index import pkg_desc_index_line_format
from portage.const import TIMESTAMP_FORMAT
from portage.manifest import guessManifestFileType
from portage.package.ebuild._parallel_manifest.ManifestScheduler import ManifestScheduler
@@ -57,7 +58,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._eventloop.global_event_loop import global_event_loop
from portage import cpv_getkey
from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
try:
from xml.etree import ElementTree
@@ -91,6 +92,9 @@ def parse_args(args):
actions.add_argument("--update-changelogs",
action="store_true",
help="update the ChangeLog files from SCM logs")
+ actions.add_argument("--update-pkg-desc-index",
+ action="store_true",
+ help="update package description index")
actions.add_argument("--update-manifests",
action="store_true",
help="update manifests")
@@ -451,6 +455,29 @@ class GenCache(object):
if hasattr(trg_cache, '_prune_empty_dirs'):
trg_cache._prune_empty_dirs()
+class GenPkgDescIndex(object):
+ def __init__(self, portdb, output_file):
+ self.returncode = os.EX_OK
+ self._portdb = portdb
+ self._output_file = output_file
+
+ def run(self):
+
+ portage.util.ensure_dirs(os.path.dirname(self._output_file))
+ f = portage.util.atomic_ofstream(self._output_file,
+ encoding=_encodings["repo.content"])
+
+ portdb = self._portdb
+ for cp in portdb.cp_all():
+ pkgs = portdb.cp_list(cp)
+ if not pkgs:
+ continue
+ desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+ f.write(pkg_desc_index_line_format(cp, pkgs, desc))
+
+ f.close()
+
class GenUseLocalDesc(object):
def __init__(self, portdb, output=None,
preserve_comments=False):
@@ -893,7 +920,8 @@ def egencache_main(args):
local_config=False, env=env)
if not (options.update or options.update_use_local_desc or
- options.update_changelogs or options.update_manifests):
+ options.update_changelogs or options.update_manifests or
+ options.update_pkg_desc_index):
parser.error('No action specified')
return 1
@@ -1057,6 +1085,12 @@ def egencache_main(args):
else:
ret.append(scheduler.returncode)
+ if options.update_pkg_desc_index:
+ gen_index = GenPkgDescIndex(portdb, os.path.join(
+ repo_config.location, "metadata", "pkg_desc_index"))
+ gen_index.run()
+ ret.append(gen_index.returncode)
+
if options.update_use_local_desc:
gen_desc = GenUseLocalDesc(portdb,
output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
.BR "\-\-update\-changelogs"
Update the ChangeLog files from SCM logs (supported only in git repos).
.TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
.BR "\-\-update\-use\-local\-desc"
Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
.TP
diff --git a/man/portage.5 b/man/portage.5
index 309e259..f2f5243 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -76,6 +76,7 @@ user\-defined package sets
.BR /usr/portage/metadata/
.nf
layout.conf
+pkg_desc_index
.fi
.TP
.BR /usr/portage/profiles/
@@ -1138,6 +1139,17 @@ cache\-formats = md5-dict pms
profile\-formats = portage-2
.fi
.RE
+.TP
+.BR pkg_desc_index
+This is an index of package names, versions, and descriptions which
+may be generated by \fBegencache\fR(1) in order to optimize
+\fBemerge\fR(1) search actions.
+
+.I Example:
+.nf
+sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
+sys-apps/usleep 0.1: A wrapper for usleep
+.fi
.RE
.TP
.BR /usr/portage/profiles/
diff --git a/pym/portage/cache/index/__init__.py b/pym/portage/cache/index/__init__.py
new file mode 100644
index 0000000..7cd880e
--- /dev/null
+++ b/pym/portage/cache/index/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
diff --git a/pym/portage/cache/index/pkg_desc_index.py b/pym/portage/cache/index/pkg_desc_index.py
new file mode 100644
index 0000000..ed2cdf7
--- /dev/null
+++ b/pym/portage/cache/index/pkg_desc_index.py
@@ -0,0 +1,59 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+from __future__ import unicode_literals
+
+import collections
+import sys
+
+from portage.versions import _pkg_str
+
+if sys.hexversion >= 0x3000000:
+ _unicode = str
+else:
+ _unicode = unicode
+
+pkg_desc_index_node = collections.namedtuple("pkg_desc_index_node",
+ ["cp", "cpv_list", "desc"])
+
+class pkg_node(_unicode):
+ """
+ A minimal package node class. For performance reasons, inputs
+ are not validated.
+ """
+
+ def __init__(self, cp, version, repo = None):
+ self.__dict__['cp'] = cp
+ self.__dict__['repo'] = repo
+ self.__dict__['version'] = version
+
+ def __new__(cls, cp, version, repo = None):
+ return _unicode.__new__(cls, cp + "-" + version)
+
+ def __setattr__(self, name, value):
+ raise AttributeError("pkg_node instances are immutable",
+ self.__class__, name, value)
+
+def pkg_desc_index_line_format(cp, pkgs, desc):
+ return "%s %s: %s\n" % (cp,
+ " ".join(_pkg_str(cpv).version
+ for cpv in pkgs), desc)
+
+def pkg_desc_index_line_read(line, repo = None):
+
+ try:
+ pkgs, desc = line.split(":", 1)
+ except ValueError:
+ return None
+ desc = desc.strip()
+
+ try:
+ cp, pkgs = pkgs.split(" ", 1)
+ except ValueError:
+ return None
+
+ cp_list = []
+ for ver in pkgs.split():
+ cp_list.append(pkg_node(cp, ver, repo))
+
+ return pkg_desc_index_node(cp, tuple(cp_list), desc)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] Re: [PATCH 5/5 v3] Add emerge --search-index option.
2014-11-04 5:42 ` [gentoo-portage-dev] [PATCH 5/5 v3] " Zac Medico
@ 2014-11-04 9:10 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-04 9:10 UTC (permalink / raw
To: gentoo-portage-dev
On 11/03/2014 09:42 PM, Zac Medico wrote:
> This updated patch causes indexed search to be enabled only for
> searchdesc, since indexed variants can actually be slower when
> only package names need to be searched.
With the recent pkg_desc_index_line_read optimization (removal of
package name and version validation), performance is better if we use
the index for emerge --search too. Therefore, revert to the
previous version of this patch.
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [gentoo-portage-dev]
2014-11-03 21:42 ` [gentoo-portage-dev] Brian Dolbec
@ 2014-11-04 9:19 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-04 9:19 UTC (permalink / raw
To: gentoo-portage-dev
On 11/03/2014 01:42 PM, Brian Dolbec wrote:
> I know the above times are probably slowed by not having overlays indexed, but it is still a significant speedup.
> esearch is only marginally slower doing a -S search than a regular pkg-name search, typically just over 0.2 seconds
>
>
> The above was done with only the gentoo repo indexed, plus I need to clean out some old ebuilds in overlays.
> But I was shocked to see the normal -s searches going from 3.4s to 5.5s with your new index.
If you update now, performance should be much better. Specifically:
1) pkg_desc_index_line_read has been optimized to skip validation
(biggest performance problem).
2) IndexedPortdb has been optimized to use a single portdbapi.cp_all
call that covers all of the unindexed repositories.
> Also, I did not see a way to request that all repos have their indexes
> updated; I believe that is a requirement of this new system.
> Re-running it for each installed repo individually is something best
> reserved for the new postsync() to do when it lands in master.
Yeah, I guess we may as well wait for the plugin-sync branch to get
merged first.
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 3/5 v3] Add IndexedPortdb class.
2014-11-04 5:07 ` [gentoo-portage-dev] [PATCH 3/5 v2] " Zac Medico
@ 2014-11-04 20:34 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-04 20:34 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The IndexedPortdb class uses pkg_desc_index to optimize searches for
package names and descriptions. If the package description index is
missing from a particular repository, then all metadata for that
repository is obtained using the normal portdbapi.aux_get method.
This class only implements a subset of portdbapi functionality that is
useful for searching pkg_desc_index incrementally. For this reason,
the cp_all method returns an ordered iterator instead of a list, so
that search results can be displayed incrementally.
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch optimizes IndexedPortdb to avoid unnecessary cp_list calls
for repositories that are not indexed. Now IndexedPortdb performs almost as well
as the regular portdbapi for the case where no repositories are indexed.
pym/portage/dbapi/IndexedPortdb.py | 165 +++++++++++++++++++++++++++++++++++++
1 file changed, 165 insertions(+)
create mode 100644 pym/portage/dbapi/IndexedPortdb.py
diff --git a/pym/portage/dbapi/IndexedPortdb.py b/pym/portage/dbapi/IndexedPortdb.py
new file mode 100644
index 0000000..fc431a2
--- /dev/null
+++ b/pym/portage/dbapi/IndexedPortdb.py
@@ -0,0 +1,165 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import errno
+import io
+import functools
+import operator
+import os
+
+import portage
+from portage import _encodings
+from portage.dep import Atom
+from portage.exception import FileNotFound
+from portage.cache.index.IndexStreamIterator import IndexStreamIterator
+from portage.cache.index.pkg_desc_index import \
+ pkg_desc_index_line_read, pkg_desc_index_node
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
+from portage.versions import _pkg_str
+
+class IndexedPortdb(object):
+ """
+ A portdbapi interface that uses a package description index to
+ improve performance. If the description index is missing for a
+ particular repository, then all metadata for that repository is
+ obtained using the normal portdbapi.aux_get method.
+
+ For performance reasons, the match method only supports package
+ name and version constraints. For the same reason, the xmatch
+ method is not implemented.
+ """
+
+ _copy_attrs = ('cpv_exists', 'findname', 'getFetchMap',
+ '_aux_cache_keys', '_cpv_sort_ascending',
+ '_have_root_eclass_dir')
+
+ def __init__(self, portdb):
+
+ self._portdb = portdb
+
+ for k in self._copy_attrs:
+ setattr(self, k, getattr(portdb, k))
+
+ self._desc_cache = None
+ self._cp_map = None
+ self._unindexed_cp_map = None
+
+ def _init_index(self):
+
+ cp_map = {}
+ desc_cache = {}
+ self._desc_cache = desc_cache
+ self._cp_map = cp_map
+ index_missing = []
+
+ streams = []
+ for repo_path in self._portdb.porttrees:
+ outside_repo = os.path.join(self._portdb.depcachedir,
+ repo_path.lstrip(os.sep))
+ filenames = []
+ for parent_dir in (repo_path, outside_repo):
+ filenames.append(os.path.join(parent_dir,
+ "metadata", "pkg_desc_index"))
+
+ repo_name = self._portdb.getRepositoryName(repo_path)
+
+ try:
+ f = None
+ for filename in filenames:
+ try:
+ f = io.open(filename,
+ encoding=_encodings["repo.content"])
+ except IOError as e:
+ if e.errno not in (errno.ENOENT, errno.ESTALE):
+ raise
+ else:
+ break
+
+ if f is None:
+ raise FileNotFound(filename)
+
+ streams.append(iter(IndexStreamIterator(f,
+ functools.partial(pkg_desc_index_line_read,
+ repo = repo_name))))
+ except FileNotFound:
+ index_missing.append(repo_path)
+
+ if index_missing:
+ self._unindexed_cp_map = {}
+
+ class _NonIndexedStream(object):
+ def __iter__(self_):
+ for cp in self._portdb.cp_all(
+ trees = index_missing):
+ # Don't call cp_list yet, since it's a waste
+ # if the package name does not match the current
+ # search.
+ self._unindexed_cp_map[cp] = index_missing
+ yield pkg_desc_index_node(cp, (), None)
+
+ streams.append(iter(_NonIndexedStream()))
+
+ if streams:
+ if len(streams) == 1:
+ cp_group_iter = ([node] for node in streams[0])
+ else:
+ cp_group_iter = MultiIterGroupBy(streams,
+ key = operator.attrgetter("cp"))
+
+ for cp_group in cp_group_iter:
+
+ new_cp = None
+ cp_list = cp_map.get(cp_group[0].cp)
+ if cp_list is None:
+ new_cp = cp_group[0].cp
+ cp_list = []
+ cp_map[cp_group[0].cp] = cp_list
+
+ for entry in cp_group:
+ cp_list.extend(entry.cpv_list)
+ if entry.desc is not None:
+ for cpv in entry.cpv_list:
+ desc_cache[cpv] = entry.desc
+
+ if new_cp is not None:
+ yield cp_group[0].cp
+
+ def cp_all(self):
+ """
+ Returns an ordered iterator instead of a list, so that search
+ results can be displayed incrementally.
+ """
+ if self._cp_map is None:
+ return self._init_index()
+ return iter(sorted(self._cp_map))
+
+ def match(self, atom):
+ """
+ For performance reasons, only package name and version
+ constraints are supported.
+ """
+ if not isinstance(atom, Atom):
+ atom = Atom(atom)
+ cp_list = self._cp_map.get(atom.cp)
+ if cp_list is None:
+ return []
+
+ if self._unindexed_cp_map is not None:
+ try:
+ unindexed = self._unindexed_cp_map.pop(atom.cp)
+ except KeyError:
+ pass
+ else:
+ cp_list.extend(self._portdb.cp_list(atom.cp,
+ mytree = unindexed))
+
+ self._portdb._cpv_sort_ascending(cp_list)
+ return portage.match_from_list(atom, cp_list)
+
+ def aux_get(self, cpv, attrs, myrepo = None):
+ if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+ try:
+ return [self._desc_cache[cpv]]
+ except KeyError:
+ pass
+ return self._portdb.aux_get(cpv, attrs)
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH 5/5 v4] Add emerge --search-index option.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
2014-11-01 23:04 ` Zac Medico
@ 2014-11-04 22:09 ` Zac Medico
1 sibling, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-04 22:09 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
The new emerge --search-index option, which is enabled by default,
causes pkg_desc_index to be used for search optimization. The search
index needs to be regenerated by egencache after changes are made to
a repository (see the --update-pkg-desc-index action).
For users that would like to modify ebuilds in a repository without
running egencache afterwards, emerge --search-index=n can be used to
get non-indexed search. Alternatively, the user could simply remove
the stale index file, in order to disable the search index for a
particular repository.
In order to conserve memory, indices are read as streams, and
MultiIterGroupBy is used to group results from IndexedPortdb and
IndexedVardb. Stream-oriented search also makes it possible to
display search results incrementally (fixing bug #412471).
X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
This updated patch optimizes search._iter_search and search.output to use
fewer search._xmatch calls. With this optimization, performance with
IndexedPortdb is nearly indistinguishable from regular portdbapi for the
case where no repositories are indexed.
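One incidental change worth calling out: output() no longer buffers all
result text in a list before printing. A standalone sketch of the idea
(the class name here is invented; the patch below implements it inline):

    from portage.util import writemsg_stdout

    class _StreamingMsg(object):
        # Drop-in replacement for a list of strings: append() writes the
        # text to stdout immediately, so results appear incrementally.
        @staticmethod
        def append(text):
            writemsg_stdout(text, noiselevel=-1)

    msg = _StreamingMsg()
    msg.append("[ Results for search key : foo ]\n")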
man/emerge.1 | 8 +++
pym/_emerge/actions.py | 3 +-
pym/_emerge/depgraph.py | 2 +-
pym/_emerge/main.py | 5 ++
pym/_emerge/search.py | 137 ++++++++++++++++++++++++++++++------------------
5 files changed, 103 insertions(+), 52 deletions(-)
diff --git a/man/emerge.1 b/man/emerge.1
index bbe71ac..7bcdd9a 100644
--- a/man/emerge.1
+++ b/man/emerge.1
@@ -796,6 +796,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
\fBHDEPEND\fR, this option affects only the former.
.TP
+.BR "\-\-search\-index < y | n >"
+Enable or disable indexed search for search actions. This option is
+enabled by default. The search index needs to be regenerated by
+\fBegencache\fR(1) after changes are made to a repository (see the
+\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
+to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
+overridden via the command line.
+.TP
.BR "\-\-select [ y | n ] (\-w short option)"
Add specified packages to the world set (inverse of
\fB\-\-oneshot\fR). This is useful if you want to
diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
index 48b0826..8a22ab5 100644
--- a/pym/_emerge/actions.py
+++ b/pym/_emerge/actions.py
@@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
searchinstance = search(root_config,
spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts,
+ search_index = myopts.get("--search-index", "y") != "n")
for mysearch in myfiles:
try:
searchinstance.execute(mysearch)
diff --git a/pym/_emerge/depgraph.py b/pym/_emerge/depgraph.py
index 94eaed8..da408ad 100644
--- a/pym/_emerge/depgraph.py
+++ b/pym/_emerge/depgraph.py
@@ -8656,7 +8656,7 @@ def ambiguous_package_name(arg, atoms, root_config, spinner, myopts):
s = search(root_config, spinner, "--searchdesc" in myopts,
"--quiet" not in myopts, "--usepkg" in myopts,
- "--usepkgonly" in myopts)
+ "--usepkgonly" in myopts, search_index = False)
null_cp = portage.dep_getkey(insert_category_into_atom(
arg, "null"))
cat, atom_pn = portage.catsplit(null_cp)
diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
index cf7966c..c08e12a 100644
--- a/pym/_emerge/main.py
+++ b/pym/_emerge/main.py
@@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
"choices" :("True", "rdeps")
},
+ "--search-index": {
+ "help": "Enable or disable indexed search (enabled by default)",
+ "choices": y_or_n
+ },
+
"--select": {
"shortopt" : "-w",
"help" : "add specified packages to the world set " + \
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..1916afe 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -7,9 +7,12 @@ import re
import portage
from portage import os
from portage.dbapi.porttree import _parse_uri_map
+from portage.dbapi.IndexedPortdb import IndexedPortdb
+from portage.dbapi.IndexedVardb import IndexedVardb
from portage.localization import localized_size
from portage.output import bold, bold as white, darkgreen, green, red
from portage.util import writemsg_stdout
+from portage.util.iterators.MultiIterGroupBy import MultiIterGroupBy
from _emerge.Package import Package
@@ -25,15 +28,17 @@ class search(object):
# public interface
#
def __init__(self, root_config, spinner, searchdesc,
- verbose, usepkg, usepkgonly):
+ verbose, usepkg, usepkgonly, search_index = True):
"""Searches the available and installed packages for the supplied search key.
The list of available and installed packages is created at object instantiation.
This makes successive searches faster."""
self.settings = root_config.settings
- self.vartree = root_config.trees["vartree"]
- self.spinner = spinner
self.verbose = verbose
self.searchdesc = searchdesc
+ self.searchkey = None
+ # Disable the spinner since search results are displayed
+ # incrementally.
+ self.spinner = None
self.root_config = root_config
self.setconfig = root_config.setconfig
self.matches = {"pkg" : []}
@@ -45,6 +50,10 @@ class search(object):
bindb = root_config.trees["bintree"].dbapi
vardb = root_config.trees["vartree"].dbapi
+ if search_index:
+ portdb = IndexedPortdb(portdb)
+ vardb = IndexedVardb(vardb)
+
if not usepkgonly and portdb._have_root_eclass_dir:
self._dbs.append(portdb)
@@ -53,16 +62,23 @@ class search(object):
self._dbs.append(vardb)
self._portdb = portdb
+ self._vardb = vardb
def _spinner_update(self):
if self.spinner:
self.spinner.update()
def _cp_all(self):
- cp_all = set()
+ iterators = []
for db in self._dbs:
- cp_all.update(db.cp_all())
- return list(sorted(cp_all))
+ i = db.cp_all()
+ try:
+ i = iter(i)
+ except TypeError:
+ pass
+ iterators.append(i)
+ for group in MultiIterGroupBy(iterators):
+ yield group[0]
def _aux_get(self, *args, **kwargs):
for db in self._dbs:
@@ -97,7 +113,7 @@ class search(object):
return {}
def _visible(self, db, cpv, metadata):
- installed = db is self.vartree.dbapi
+ installed = db is self._vardb
built = installed or db is not self._portdb
pkg_type = "ebuild"
if installed:
@@ -171,8 +187,11 @@ class search(object):
def execute(self,searchkey):
"""Performs the search for the supplied search key"""
+ self.searchkey = searchkey
+
+ def _iter_search(self):
+
match_category = 0
- self.searchkey=searchkey
self.packagematches = []
if self.searchdesc:
self.searchdesc=1
@@ -181,6 +200,7 @@ class search(object):
self.searchdesc=0
self.matches = {"pkg":[], "set":[]}
print("Searching... ", end=' ')
+ print()
regexsearch = False
if self.searchkey.startswith('%'):
@@ -204,27 +224,26 @@ class search(object):
masked=0
if self.searchre.search(match_string):
- if not self._xmatch("match-visible", package):
- masked=1
- self.matches["pkg"].append([package,masked])
+ yield ("pkg", package)
elif self.searchdesc: # DESCRIPTION searching
- full_package = self._xmatch("bestmatch-visible", package)
+ # Check for DESCRIPTION match first, so that we can skip
+ # the expensive visibility check if it doesn't match.
+ full_package = self._xmatch("match-all", package)
if not full_package:
- #no match found; we don't want to query description
- full_package = portage.best(
- self._xmatch("match-all", package))
- if not full_package:
- continue
- else:
- masked=1
+ continue
+ full_package = full_package[-1]
try:
full_desc = self._aux_get(
full_package, ["DESCRIPTION"])[0]
except KeyError:
- print("emerge: search: aux_get() failed, skipping")
+ portage.writemsg(
+ "emerge: search: aux_get() failed, skipping\n",
+ noiselevel=-1)
continue
- if self.searchre.search(full_desc):
- self.matches["desc"].append([full_package,masked])
+ if not self.searchre.search(full_desc):
+ continue
+
+ yield ("desc", package)
self.sdict = self.setconfig.getSets()
for setname in self.sdict:
@@ -235,51 +254,56 @@ class search(object):
match_string = setname.split("/")[-1]
if self.searchre.search(match_string):
- self.matches["set"].append([setname, False])
+ yield ("set", setname, False)
elif self.searchdesc:
if self.searchre.search(
self.sdict[setname].getMetadata("DESCRIPTION")):
- self.matches["set"].append([setname, False])
-
- self.mlen=0
- for mtype in self.matches:
- self.matches[mtype].sort()
- self.mlen += len(self.matches[mtype])
+ yield ("set", setname)
def addCP(self, cp):
if not self._xmatch("match-all", cp):
return
- masked = 0
- if not self._xmatch("bestmatch-visible", cp):
- masked = 1
- self.matches["pkg"].append([cp, masked])
+ self.matches["pkg"].append(cp)
self.mlen += 1
def output(self):
"""Outputs the results of the search."""
- msg = []
+
+ class msg(object):
+ @staticmethod
+ def append(msg):
+ writemsg_stdout(msg, noiselevel=-1)
+
msg.append("\b\b \n[ Results for search key : " + \
bold(self.searchkey) + " ]\n")
- msg.append("[ Applications found : " + \
- bold(str(self.mlen)) + " ]\n\n")
- vardb = self.vartree.dbapi
+ vardb = self._vardb
metadata_keys = set(Package.metadata_keys)
metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
metadata_keys = tuple(metadata_keys)
- for mtype in self.matches:
- for match,masked in self.matches[mtype]:
+
+ if self.searchkey is None:
+ # Handle results added via addCP
+ addCP_matches = []
+ for mytype, match in self.matches.items():
+ addCP_matches.append((mytype, match))
+ iterator = iter(addCP_matches)
+
+ else:
+ # Do a normal search
+ iterator = self._iter_search()
+
+ for mtype, match in iterator:
+ self.mlen += 1
+ masked = False
full_package = None
- if mtype == "pkg":
+ if mtype in ("pkg", "desc"):
full_package = self._xmatch(
"bestmatch-visible", match)
if not full_package:
- #no match found; we don't want to query description
- masked=1
- full_package = portage.best(
- self._xmatch("match-all",match))
- elif mtype == "desc":
- full_package = match
- match = portage.cpv_getkey(match)
+ masked = True
+ full_package = self._xmatch("match-all", match)
+ if full_package:
+ full_package = full_package[-1]
elif mtype == "set":
msg.append(green("*") + " " + bold(match) + "\n")
if self.verbose:
@@ -367,12 +391,26 @@ class search(object):
+ " " + desc + "\n")
msg.append(" " + darkgreen("License:") + \
" " + license + "\n\n")
- writemsg_stdout(''.join(msg), noiselevel=-1)
+
+ msg.append("[ Applications found : " + \
+ bold(str(self.mlen)) + " ]\n\n")
+
+ # This method can be called multiple times, so
+ # reset the match count for the next call. Don't
+ # reset it at the beginning of this method, since
+ # that would lose modifications from the addCP
+ # method.
+ self.mlen = 0
+
#
# private interface
#
def getInstallationStatus(self,package):
- installed_package = self.vartree.dep_bestmatch(package)
+ installed_package = self._vardb.match(package)
+ if installed_package:
+ installed_package = installed_package[-1]
+ else:
+ installed_package = ""
result = ""
version = self.getVersion(installed_package,search.VERSION_RELEASE)
if len(version) > 0:
@@ -391,4 +429,3 @@ class search(object):
else:
result = ""
return result
-
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] Re: [PATCH 4/5] Add IndexedVardb class.
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class Zac Medico
@ 2014-11-05 9:59 ` Zac Medico
2014-11-07 8:45 ` [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates Zac Medico
0 siblings, 1 reply; 29+ messages in thread
From: Zac Medico @ 2014-11-05 9:59 UTC (permalink / raw
To: gentoo-portage-dev
On 11/01/2014 03:46 PM, Zac Medico wrote:
> Searching of installed packages is optimized to take advantage of
> vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
> This class only implements a subset of vardbapi functionality that is
> useful for searching incrementally. For this reason, the cp_all method
> returns an ordered iterator instead of a list, so that search results
> can be displayed incrementally.
>
> X-Gentoo-Bug: 525718
> X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
I think we should create a vdb index that can be appended to, like a
journal. Then IndexedVardb could just read the index/journal and that
would provide all of the needed data without having to look inside
/var/db/pkg/*.
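
For illustration only, a minimal sketch of what such an append-only journal could look like, using a line-oriented JSON format; the file name, record fields, and helper names here are all hypothetical and are not part of any patch in this thread:

import json
import os

# Hypothetical journal location; the real vdb cache files live elsewhere.
JOURNAL_PATH = "/var/cache/edb/vdb_journal.jsonl"

def append_event(event, cpv, slot, counter):
    """Append one merge/unmerge record without rewriting the whole file."""
    record = {"event": event, "package": cpv, "slot": slot, "counter": counter}
    with open(JOURNAL_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

def replay_events():
    """Yield recorded events in order; a missing journal yields nothing."""
    if not os.path.exists(JOURNAL_PATH):
        return
    with open(JOURNAL_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

Since records are only appended, readers such as IndexedVardb could replay them on top of the last full snapshot instead of scanning /var/db/pkg.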
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates
2014-11-05 9:59 ` [gentoo-portage-dev] " Zac Medico
@ 2014-11-07 8:45 ` Zac Medico
2014-11-07 16:51 ` Brian Dolbec
2014-11-08 9:16 ` [gentoo-portage-dev] [PATCH v2] " Zac Medico
0 siblings, 2 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-07 8:45 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This adds support to generate a vdb_metadata_delta.json file
which tracks package merges / unmerges that occur between updates to
vdb_metadata.pickle. IndexedVardb can use the delta together with
vdb_metadata.pickle to reconstruct a complete view of /var/db/pkg,
so that it can avoid expensive listdir calls in /var/db/pkg/*.
Note that vdb_metadata.pickle is only updated periodically, in
order to avoid excessive re-writes of a large file.
In order to test the performance gains from this patch, you need to
generate /var/cache/edb/vdb_metadata_delta.json first, which will
happen automatically if you run 'emerge -p anything' with root
privileges.
---
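As a rough illustration (not part of the patch itself), the delta file sketched below stores a format version, a timestamp, and a list of event records; a tiny reader along these lines could dump it for debugging. The field names follow the delta_node structure in the diff; the reader itself and its output format are hypothetical:

import json

def dump_deltas(path="/var/cache/edb/vdb_metadata_delta.json"):
    # Hypothetical standalone reader, useful only for eyeballing the file.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    print("format version:", data.get("version"))
    print("timestamp:", data.get("timestamp"))
    for node in data.get("deltas", []):
        # Each record describes one merge ("add") or unmerge ("remove").
        print("%(event)s %(package)s-%(version)s slot=%(slot)s counter=%(counter)s" % node)

if __name__ == "__main__":
    dump_deltas()
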
pym/portage/dbapi/IndexedVardb.py | 35 ++++++++-
pym/portage/dbapi/vartree.py | 161 +++++++++++++++++++++++++++++++++++---
2 files changed, 185 insertions(+), 11 deletions(-)
diff --git a/pym/portage/dbapi/IndexedVardb.py b/pym/portage/dbapi/IndexedVardb.py
index 424defc..e225ca1 100644
--- a/pym/portage/dbapi/IndexedVardb.py
+++ b/pym/portage/dbapi/IndexedVardb.py
@@ -3,6 +3,7 @@
import portage
from portage.dep import Atom
+from portage.exception import InvalidData
from portage.versions import _pkg_str
class IndexedVardb(object):
@@ -42,7 +43,39 @@ class IndexedVardb(object):
if self._cp_map is not None:
return iter(sorted(self._cp_map))
- return self._iter_cp_all()
+ cache_delta = self._vardb._cache_delta_load_race()
+ if cache_delta is None:
+ return self._iter_cp_all()
+
+ packages = self._vardb._aux_cache["packages"]
+ for delta in cache_delta["deltas"]:
+ cpv = delta["package"] + "-" + delta["version"]
+ event = delta["event"]
+ if event == "add":
+ # Use aux_get to populate the cache
+ # for this cpv.
+ if cpv not in packages:
+ try:
+ self._vardb.aux_get(cpv, ["DESCRIPTION"])
+ except KeyError:
+ pass
+ elif event == "remove":
+ packages.pop(cpv, None)
+
+ self._cp_map = cp_map = {}
+ for cpv in packages:
+ try:
+ cpv = _pkg_str(cpv)
+ except InvalidData:
+ continue
+
+ cp_list = cp_map.get(cpv.cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cpv.cp] = cp_list
+ cp_list.append(cpv)
+
+ return iter(sorted(self._cp_map))
def _iter_cp_all(self):
self._cp_map = cp_map = {}
diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
index 6ab4b92..fd4b099 100644
--- a/pym/portage/dbapi/vartree.py
+++ b/pym/portage/dbapi/vartree.py
@@ -76,6 +76,7 @@ import gc
import grp
import io
from itertools import chain
+import json
import logging
import os as _os
import platform
@@ -109,6 +110,7 @@ class vardbapi(dbapi):
"|".join(_excluded_dirs) + r')$')
_aux_cache_version = "1"
+ _aux_cache_delta_version = "1"
_owners_cache_version = "1"
# Number of uncached packages to trigger cache update, since
@@ -177,6 +179,8 @@ class vardbapi(dbapi):
self._aux_cache_obj = None
self._aux_cache_filename = os.path.join(self._eroot,
CACHE_PATH, "vdb_metadata.pickle")
+ self._cache_delta_filename = os.path.join(self._eroot,
+ CACHE_PATH, "vdb_metadata_delta.json")
self._counter_path = os.path.join(self._eroot,
CACHE_PATH, "counter")
@@ -511,6 +515,120 @@ class vardbapi(dbapi):
self.cpcache.pop(pkg_dblink.mysplit[0], None)
dircache.pop(pkg_dblink.dbcatdir, None)
+ def _cache_delta(self, event, cpv, slot, counter):
+
+ self.lock()
+ try:
+ deltas_obj = self._cache_delta_load()
+
+ if deltas_obj is None:
+ # We can't record meaningful deltas without
+ # a pre-existing state.
+ return
+
+ delta_node = {
+ "event": event,
+ "package": cpv.cp,
+ "version": cpv.version,
+ "slot": slot,
+ "counter": "%s" % counter
+ }
+
+ deltas_obj["deltas"].append(delta_node)
+
+ # Eliminate earlier nodes cancelled out by later nodes
+ # that have identical package and slot attributes.
+ filtered_list = []
+ slot_keys = set()
+ version_keys = set()
+ for delta_node in reversed(deltas_obj["deltas"]):
+ slot_key = (delta_node["package"],
+ delta_node["slot"])
+ version_key = (delta_node["package"],
+ delta_node["version"])
+ if not (slot_key in slot_keys or \
+ version_key in version_keys):
+ filtered_list.append(delta_node)
+ slot_keys.add(slot_key)
+ version_keys.add(version_key)
+
+ filtered_list.reverse()
+ deltas_obj["deltas"] = filtered_list
+
+ f = atomic_ofstream(self._cache_delta_filename,
+ mode='w', encoding=_encodings['repo.content'])
+ json.dump(deltas_obj, f, ensure_ascii=False)
+ f.close()
+
+ finally:
+ self.unlock()
+
+ def _cache_delta_load(self):
+
+ if not os.path.exists(self._aux_cache_filename):
+ # If the primary cache doesn't exist yet, then
+ # we can't record a delta against it.
+ return None
+
+ try:
+ with io.open(self._cache_delta_filename, 'r',
+ encoding=_encodings['repo.content'],
+ errors='strict') as f:
+ cache_obj = json.load(f)
+ except EnvironmentError as e:
+ if e.errno not in (errno.ENOENT, errno.ESTALE):
+ raise
+ except (SystemExit, KeyboardInterrupt):
+ raise
+ except Exception:
+ # Corrupt, or not json format.
+ pass
+ else:
+ try:
+ version = cache_obj["version"]
+ except KeyError:
+ pass
+ else:
+ # If the timestamp recorded in the deltas file
+ # doesn't match aux_cache_timestamp, then the
+ # deltas are not valid. This means that deltas
+ # cannot be recorded until after the next
+ # vdb_metadata.pickle update, in order to
+ # guarantee consistency.
+ if version == self._aux_cache_delta_version:
+ try:
+ deltas = cache_obj["deltas"]
+ except KeyError:
+ cache_obj["deltas"] = deltas = []
+
+ if isinstance(deltas, list):
+ return cache_obj
+
+ return None
+
+ def _cache_delta_load_race(self):
+ """
+ This calls _cache_delta_load and validates the timestamp
+ against the currently loaded _aux_cache. If a concurrent
+ update causes the timestamps to be inconsistent, then
+ it reloads the caches and tries one more time before
+ it aborts. In practice, the race is very unlikely, so
+ this will usually succeed on the first try.
+ """
+
+ tries = 2
+ while tries:
+ tries -= 1
+ cache_delta = self._cache_delta_load()
+ if cache_delta is not None and \
+ cache_delta.get("timestamp") != \
+ self._aux_cache.get("timestamp", False):
+ self._aux_cache_obj = None
+ else:
+ return cache_delta
+
+ return None
+
def match(self, origdep, use_cache=1):
"caching match function"
mydep = dep_expand(
@@ -556,22 +674,37 @@ class vardbapi(dbapi):
long as at least part of the cache is still valid)."""
if self._flush_cache_enabled and \
self._aux_cache is not None and \
- len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
- secpass >= 2:
+ secpass >= 2 and \
+ (len(self._aux_cache["modified"]) >= self._aux_cache_threshold or
+ not os.path.exists(self._cache_delta_filename)):
+
+ ensure_dirs(os.path.dirname(self._aux_cache_filename))
+
self._owners.populate() # index any unindexed contents
valid_nodes = set(self.cpv_all())
for cpv in list(self._aux_cache["packages"]):
if cpv not in valid_nodes:
del self._aux_cache["packages"][cpv]
del self._aux_cache["modified"]
- try:
- f = atomic_ofstream(self._aux_cache_filename, 'wb')
- pickle.dump(self._aux_cache, f, protocol=2)
- f.close()
- apply_secpass_permissions(
- self._aux_cache_filename, gid=portage_gid, mode=0o644)
- except (IOError, OSError) as e:
- pass
+ timestamp = time.time()
+ self._aux_cache["timestamp"] = timestamp
+
+ f = atomic_ofstream(self._aux_cache_filename, 'wb')
+ pickle.dump(self._aux_cache, f, protocol=2)
+ f.close()
+ apply_secpass_permissions(
+ self._aux_cache_filename, mode=0o644)
+
+ f = atomic_ofstream(self._cache_delta_filename, 'w',
+ encoding=_encodings['repo.content'], errors='strict')
+ json.dump({
+ "version": self._aux_cache_delta_version,
+ "timestamp": timestamp
+ }, f, ensure_ascii=False)
+ f.close()
+ apply_secpass_permissions(
+ self._cache_delta_filename, mode=0o644)
+
self._aux_cache["modified"] = set()
@property
@@ -1590,6 +1723,12 @@ class dblink(object):
self.dbdir, noiselevel=-1)
return
+ if self.dbdir is self.dbpkgdir:
+ counter, = self.vartree.dbapi.aux_get(
+ self.mycpv, ["COUNTER"])
+ self.vartree.dbapi._cache_delta("remove", self.mycpv,
+ self.settings["SLOT"].split("/")[0], counter)
+
shutil.rmtree(self.dbdir)
# If empty, remove parent category directory.
try:
@@ -4196,6 +4335,8 @@ class dblink(object):
self.delete()
_movefile(self.dbtmpdir, self.dbpkgdir, mysettings=self.settings)
self._merged_path(self.dbpkgdir, os.lstat(self.dbpkgdir))
+ self.vartree.dbapi._cache_delta("add",
+ self.mycpv, slot, counter)
finally:
self.unlockdb()
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates
2014-11-07 8:45 ` [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates Zac Medico
@ 2014-11-07 16:51 ` Brian Dolbec
2014-11-07 20:17 ` Zac Medico
2014-11-08 9:16 ` [gentoo-portage-dev] [PATCH v2] " Zac Medico
1 sibling, 1 reply; 29+ messages in thread
From: Brian Dolbec @ 2014-11-07 16:51 UTC (permalink / raw
To: gentoo-portage-dev
On Fri, 7 Nov 2014 00:45:55 -0800
Zac Medico <zmedico@gentoo.org> wrote:
> This adds support to generate a vdb_metadata_delta.json file
> which tracks package merges / unmerges that occur between updates to
> vdb_metadata.pickle. IndexedVardb can use the delta together with
> vdb_metadata.pickle to reconstruct a complete view of /var/db/pkg,
> so that it can avoid expensive listdir calls in /var/db/pkg/*.
> Note that vdb_metadata.pickle is only updated periodically, in
> order to avoid excessive re-writes of a large file.
>
> In order to test the performance gains from this patch, you need to
> generate /var/cache/edb/vdb_metadata_delta.json first, which will
> happen automatically if you run 'emerge -p anything' with root
> privileges.
> ---
> pym/portage/dbapi/IndexedVardb.py | 35 ++++++++-
> pym/portage/dbapi/vartree.py | 161 +++++++++++++++++++++++++++++++++++---
> 2 files changed, 185 insertions(+), 11 deletions(-)
>
> diff --git a/pym/portage/dbapi/IndexedVardb.py b/pym/portage/dbapi/IndexedVardb.py
> index 424defc..e225ca1 100644
> --- a/pym/portage/dbapi/IndexedVardb.py
> +++ b/pym/portage/dbapi/IndexedVardb.py
> @@ -3,6 +3,7 @@
>
> import portage
> from portage.dep import Atom
> +from portage.exception import InvalidData
> from portage.versions import _pkg_str
>
> class IndexedVardb(object):
> @@ -42,7 +43,39 @@ class IndexedVardb(object):
> if self._cp_map is not None:
> return iter(sorted(self._cp_map))
>
> - return self._iter_cp_all()
> + cache_delta = self._vardb._cache_delta_load_race()
> + if cache_delta is None:
> + return self._iter_cp_all()
> +
> + packages = self._vardb._aux_cache["packages"]
> + for delta in cache_delta["deltas"]:
> + cpv = delta["package"] + "-" + delta["version"]
> + event = delta["event"]
> + if event == "add":
> + # Use aux_get to populate the cache
> + # for this cpv.
> + if cpv not in packages:
> + try:
> + self._vardb.aux_get(cpv, ["DESCRIPTION"])
> + except KeyError:
> + pass
> + elif event == "remove":
> + packages.pop(cpv, None)
> +
> + self._cp_map = cp_map = {}
> + for cpv in packages:
> + try:
> + cpv = _pkg_str(cpv)
> + except InvalidData:
> + continue
> +
> + cp_list = cp_map.get(cpv.cp)
> + if cp_list is None:
> + cp_list = []
> + cp_map[cpv.cp] = cp_list
> + cp_list.append(cpv)
> +
> + return iter(sorted(self._cp_map))
>
> def _iter_cp_all(self):
> self._cp_map = cp_map = {}
looks good
> diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
> index 6ab4b92..fd4b099 100644
> --- a/pym/portage/dbapi/vartree.py
> +++ b/pym/portage/dbapi/vartree.py
> @@ -76,6 +76,7 @@ import gc
> import grp
> import io
> from itertools import chain
> +import json
> import logging
> import os as _os
> import platform
> @@ -109,6 +110,7 @@ class vardbapi(dbapi):
> "|".join(_excluded_dirs) + r')$')
>
> _aux_cache_version = "1"
> + _aux_cache_delta_version = "1"
> _owners_cache_version = "1"
>
> # Number of uncached packages to trigger cache update, since
> @@ -177,6 +179,8 @@ class vardbapi(dbapi):
> self._aux_cache_obj = None
> self._aux_cache_filename = os.path.join(self._eroot,
> CACHE_PATH, "vdb_metadata.pickle")
> + self._cache_delta_filename = os.path.join(self._eroot,
> + CACHE_PATH, "vdb_metadata_delta.json")
> self._counter_path = os.path.join(self._eroot,
> CACHE_PATH, "counter")
>
> @@ -511,6 +515,120 @@ class vardbapi(dbapi):
> self.cpcache.pop(pkg_dblink.mysplit[0], None)
> dircache.pop(pkg_dblink.dbcatdir, None)
>
The following code I would like to see either as an independent class
and file if possible, then just instantiated here in the main vardbapi.
Looking over the code, I didn't see much use of other class functions.
This class is already too large in many ways. Also, is there a
possibility that this code could be re-used as a generic delta cache
anywhere else?
Another possibility is moving this code and the aux_cache code to
another class that the vardbapi class also subclasses. This would move
all of the cache code into a small class that is easily viewed, edited,
and maintained. This file is already 5k+ LOC, and it is primarily the
vardbapi class.
> + def _cache_delta(self, event, cpv, slot, counter):
> +
> + self.lock()
> + try:
> + deltas_obj = self._cache_delta_load()
> +
> + if deltas_obj is None:
> + # We can't record meaningful deltas without
> + # a pre-existing state.
> + return
> +
> + delta_node = {
> + "event": event,
> + "package": cpv.cp,
> + "version": cpv.version,
> + "slot": slot,
> + "counter": "%s" % counter
> + }
> +
> + deltas_obj["deltas"].append(delta_node)
> +
> + # Eliminate earlier nodes cancelled out by later nodes
> + # that have identical package and slot attributes.
> + filtered_list = []
> + slot_keys = set()
> + version_keys = set()
> + for delta_node in reversed(deltas_obj["deltas"]):
> + slot_key = (delta_node["package"],
> + delta_node["slot"])
> + version_key = (delta_node["package"],
> + delta_node["version"])
> + if not (slot_key in slot_keys or \
> + version_key in version_keys):
> + filtered_list.append(delta_node)
> + slot_keys.add(slot_key)
> + version_keys.add(version_key)
> +
> + filtered_list.reverse()
> + deltas_obj["deltas"] = filtered_list
> +
> + f = atomic_ofstream(self._cache_delta_filename,
> + mode='w', encoding=_encodings['repo.content'])
> + json.dump(deltas_obj, f, ensure_ascii=False)
> + f.close()
> +
> + finally:
> + self.unlock()
> +
> + def _cache_delta_load(self):
> +
> + if not os.path.exists(self._aux_cache_filename):
> + # If the primary cache doesn't exist yet, then
> + # we can't record a delta against it.
> + return None
> +
> + try:
> + with io.open(self._cache_delta_filename, 'r',
> + encoding=_encodings['repo.content'],
> + errors='strict') as f:
> + cache_obj = json.load(f)
> + except EnvironmentError as e:
> + if e.errno not in (errno.ENOENT, errno.ESTALE):
> + raise
> + except (SystemExit, KeyboardInterrupt):
> + raise
> + except Exception:
> + # Corrupt, or not json format.
> + pass
> + else:
> + try:
> + version = cache_obj["version"]
> + except KeyError:
> + pass
> + else:
> + # If the timestamp recorded in the deltas file
> + # doesn't match aux_cache_timestamp, then the
> + # deltas are not valid. This means that deltas
> + # cannot be recorded until after the next
> + # vdb_metadata.pickle update, in order to
> + # guarantee consistency.
> + if version == self._aux_cache_delta_version:
> + try:
> + deltas = cache_obj["deltas"]
> + except KeyError:
> + cache_obj["deltas"] = deltas = []
> +
> + if isinstance(deltas, list):
> + return cache_obj
> +
> + return None
> +
> + def _cache_delta_load_race(self):
> + """
> + This calls _cache_delta_load and validates the timestamp
> + against the currently loaded _aux_cache. If a concurrent
> + update causes the timestamps to be inconsistent, then
> + it reloads the caches and tries one more time before
> + it aborts. In practice, the race is very unlikely, so
> + this will usually succeed on the first try.
> + """
> +
> + tries = 2
> + while tries:
> + tries -= 1
> + cache_delta = self._cache_delta_load()
> + if cache_delta is not None and \
> + cache_delta.get("timestamp") != \
> + self._aux_cache.get("timestamp",
> False):
> + self._aux_cache_obj = None
> + else:
> + return cache_delta
> +
> + return None
> +
> def match(self, origdep, use_cache=1):
> "caching match function"
> mydep = dep_expand(
> @@ -556,22 +674,37 @@ class vardbapi(dbapi):
> long as at least part of the cache is still valid)."""
> if self._flush_cache_enabled and \
> self._aux_cache is not None and \
> - len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
> - secpass >= 2:
> + secpass >= 2 and \
> + (len(self._aux_cache["modified"]) >= self._aux_cache_threshold or
> + not os.path.exists(self._cache_delta_filename)):
> +
> + ensure_dirs(os.path.dirname(self._aux_cache_filename))
> +
> self._owners.populate() # index any unindexed contents
> valid_nodes = set(self.cpv_all())
> for cpv in list(self._aux_cache["packages"]):
> if cpv not in valid_nodes:
> del self._aux_cache["packages"][cpv]
> del self._aux_cache["modified"]
> - try:
> - f = atomic_ofstream(self._aux_cache_filename, 'wb')
> - pickle.dump(self._aux_cache, f, protocol=2)
> - f.close()
> - apply_secpass_permissions(
> - self._aux_cache_filename, gid=portage_gid, mode=0o644)
> - except (IOError, OSError) as e:
> - pass
> + timestamp = time.time()
> + self._aux_cache["timestamp"] = timestamp
> +
> + f = atomic_ofstream(self._aux_cache_filename, 'wb')
> + pickle.dump(self._aux_cache, f, protocol=2)
> + f.close()
> + apply_secpass_permissions(
> + self._aux_cache_filename, mode=0o644)
> +
> + f = atomic_ofstream(self._cache_delta_filename, 'w',
> + encoding=_encodings['repo.content'], errors='strict')
> + json.dump({
> + "version": self._aux_cache_delta_version,
> + "timestamp": timestamp
> + }, f, ensure_ascii=False)
> + f.close()
> + apply_secpass_permissions(
> + self._cache_delta_filename, mode=0o644)
> +
> self._aux_cache["modified"] = set()
>
> @property
> @@ -1590,6 +1723,12 @@ class dblink(object):
> self.dbdir, noiselevel=-1)
> return
>
> + if self.dbdir is self.dbpkgdir:
> + counter, = self.vartree.dbapi.aux_get(
> + self.mycpv, ["COUNTER"])
> + self.vartree.dbapi._cache_delta("remove", self.mycpv,
> + self.settings["SLOT"].split("/")[0], counter)
> +
> shutil.rmtree(self.dbdir)
> # If empty, remove parent category directory.
> try:
> @@ -4196,6 +4335,8 @@ class dblink(object):
> self.delete()
> _movefile(self.dbtmpdir, self.dbpkgdir, mysettings=self.settings)
> self._merged_path(self.dbpkgdir, os.lstat(self.dbpkgdir))
> + self.vartree.dbapi._cache_delta("add",
> + self.mycpv, slot, counter)
> finally:
> self.unlockdb()
>
--
Brian Dolbec <dolsen>
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates
2014-11-07 16:51 ` Brian Dolbec
@ 2014-11-07 20:17 ` Zac Medico
0 siblings, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-07 20:17 UTC (permalink / raw
To: gentoo-portage-dev
On 11/07/2014 08:51 AM, Brian Dolbec wrote:
> On Fri, 7 Nov 2014 00:45:55 -0800
> Zac Medico <zmedico@gentoo.org> wrote:
>
>> This adds support to generate a vdb_metadata_delta.json file
>> which tracks package merges / unmerges that occur between updates to
>> vdb_metadata.pickle. IndexedVardb can use the delta together with
>> vdb_metadata.pickle to reconstruct a complete view of /var/db/pkg,
>> so that it can avoid expensive listdir calls in /var/db/pkg/*.
>> Note that vdb_metadata.pickle is only updated periodically, in
>> order to avoid excessive re-writes of a large file.
>>
>> In order to test the performance gains from this patch, you need to
>> generate /var/cache/edb/vdb_metadata_delta.json first, which will
>> happen automatically if you run 'emerge -p anything' with root
>> privileges.
>> ---
>
> The following code I would like to see either as an independant class
> and file if possible, then just instantiated here in the main vardbapi.
> Looking over the code, I didn't see much use of other class functions.
> This class is already too large in many ways.
Yeah, I definitely want to split it out.
> Also is there a
> possibility this code could be re-used as a generic delta cache
> anywhere else?
Maybe. For example, the PreservedLibsRegistry and WorldSelectedSet
classes both have similarities in the way that they encapsulate an on-disk
data store and manage concurrency. Maybe I'll create a helper class that
can be utilized by these classes to manage concurrency with on-disk data
stores.
> Another possibility is moving this code and the aux_cache code to
> another class that the vardbapi class also subclasses. This would move
> all the cache code to a small class easily viewed, edited, maintained.
In this case, I think a helper class will work just fine, so there will
be no need for inheritance.
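
For what it's worth, a generic helper along these lines could wrap the lock-load-mutate-write cycle that the delta code and classes like PreservedLibsRegistry share. This is only a rough stdlib sketch (the class and method names are hypothetical), not portage's actual API:

import fcntl
import json
import os
import tempfile

class OnDiskJsonStore(object):
    """Hypothetical helper: serialize concurrent updates to one JSON file."""

    def __init__(self, path):
        self._path = path
        self._lock_path = path + ".lock"

    def update(self, mutate):
        """Load the current state, apply mutate(state), write it back atomically."""
        with open(self._lock_path, "w") as lock_f:
            fcntl.flock(lock_f, fcntl.LOCK_EX)
            try:
                with open(self._path, encoding="utf-8") as f:
                    state = json.load(f)
            except (OSError, ValueError):
                # Missing or corrupt store: start from an empty state.
                state = {}
            mutate(state)
            fd, tmp = tempfile.mkstemp(dir=os.path.dirname(self._path) or ".")
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(state, f, ensure_ascii=False)
            os.rename(tmp, self._path)  # atomic replace on POSIX
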
--
Thanks,
Zac
^ permalink raw reply [flat|nested] 29+ messages in thread
* [gentoo-portage-dev] [PATCH v2] Log changes between vdb_metadata.pickle updates
2014-11-07 8:45 ` [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates Zac Medico
2014-11-07 16:51 ` Brian Dolbec
@ 2014-11-08 9:16 ` Zac Medico
1 sibling, 0 replies; 29+ messages in thread
From: Zac Medico @ 2014-11-08 9:16 UTC (permalink / raw
To: gentoo-portage-dev; +Cc: Zac Medico
This adds support to generate a vdb_metadata_delta.json file
which tracks package merges / unmerges that occur between updates to
vdb_metadata.pickle. IndexedVardb can use the delta together with
vdb_metadata.pickle to reconstruct a complete view of /var/db/pkg,
so that it can avoid expensive listdir calls in /var/db/pkg/*.
Note that vdb_metadata.pickle is only updated periodically, in
order to avoid excessive re-writes of a large file.
In order to test the performance gains from this patch, you need to
generate /var/cache/edb/vdb_metadata_delta.json first, which will
happen automatically if you run 'emerge -p anything' with root
privileges.
---
This updated patch splits out a VdbMetadataDelta class into a new file, reducing
vartree.py bloat.
pym/portage/dbapi/IndexedVardb.py | 22 ++++-
pym/portage/dbapi/VdbMetadataDelta.py | 156 ++++++++++++++++++++++++++++++++++
pym/portage/dbapi/vartree.py | 42 ++++++---
3 files changed, 209 insertions(+), 11 deletions(-)
create mode 100644 pym/portage/dbapi/VdbMetadataDelta.py
diff --git a/pym/portage/dbapi/IndexedVardb.py b/pym/portage/dbapi/IndexedVardb.py
index 424defc..38bfeed 100644
--- a/pym/portage/dbapi/IndexedVardb.py
+++ b/pym/portage/dbapi/IndexedVardb.py
@@ -3,6 +3,7 @@
import portage
from portage.dep import Atom
+from portage.exception import InvalidData
from portage.versions import _pkg_str
class IndexedVardb(object):
@@ -42,7 +43,26 @@ class IndexedVardb(object):
if self._cp_map is not None:
return iter(sorted(self._cp_map))
- return self._iter_cp_all()
+ delta_data = self._vardb._cache_delta.loadRace()
+ if delta_data is None:
+ return self._iter_cp_all()
+
+ self._vardb._cache_delta.applyDelta(delta_data)
+
+ self._cp_map = cp_map = {}
+ for cpv in self._vardb._aux_cache["packages"]:
+ try:
+ cpv = _pkg_str(cpv)
+ except InvalidData:
+ continue
+
+ cp_list = cp_map.get(cpv.cp)
+ if cp_list is None:
+ cp_list = []
+ cp_map[cpv.cp] = cp_list
+ cp_list.append(cpv)
+
+ return iter(sorted(self._cp_map))
def _iter_cp_all(self):
self._cp_map = cp_map = {}
diff --git a/pym/portage/dbapi/VdbMetadataDelta.py b/pym/portage/dbapi/VdbMetadataDelta.py
new file mode 100644
index 0000000..e6a5c47
--- /dev/null
+++ b/pym/portage/dbapi/VdbMetadataDelta.py
@@ -0,0 +1,156 @@
+# Copyright 2014 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import errno
+import io
+import json
+import os
+
+from portage import _encodings
+from portage.util import atomic_ofstream
+
+class VdbMetadataDelta(object):
+
+ _format_version = "1"
+
+ def __init__(self, vardb):
+ self._vardb = vardb
+
+ def initialize(self, timestamp):
+ f = atomic_ofstream(self._vardb._cache_delta_filename, 'w',
+ encoding=_encodings['repo.content'], errors='strict')
+ json.dump({
+ "version": self._format_version,
+ "timestamp": timestamp
+ }, f, ensure_ascii=False)
+ f.close()
+
+ def load(self):
+
+ if not os.path.exists(self._vardb._aux_cache_filename):
+ # If the primary cache doesn't exist yet, then
+ # we can't record a delta against it.
+ return None
+
+ try:
+ with io.open(self._vardb._cache_delta_filename, 'r',
+ encoding=_encodings['repo.content'],
+ errors='strict') as f:
+ cache_obj = json.load(f)
+ except EnvironmentError as e:
+ if e.errno not in (errno.ENOENT, errno.ESTALE):
+ raise
+ except (SystemExit, KeyboardInterrupt):
+ raise
+ except Exception:
+ # Corrupt, or not json format.
+ pass
+ else:
+ try:
+ version = cache_obj["version"]
+ except KeyError:
+ pass
+ else:
+ # If the timestamp recorded in the deltas file
+ # doesn't match aux_cache_timestamp, then the
+ # deltas are not valid. This means that deltas
+ # cannot be recorded until after the next
+ # vdb_metadata.pickle update, in order to
+ # guarantee consistency.
+ if version == self._format_version:
+ try:
+ deltas = cache_obj["deltas"]
+ except KeyError:
+ cache_obj["deltas"] = deltas = []
+
+ if isinstance(deltas, list):
+ return cache_obj
+
+ return None
+
+ def loadRace(self):
+ """
+ This calls self.load() and validates the timestamp
+ against the currently loaded self._vardb._aux_cache. If a
+ concurrent update causes the timestamps to be inconsistent,
+ then it reloads the caches and tries one more time before
+ it aborts. In practice, the race is very unlikely, so
+ this will usually succeed on the first try.
+ """
+
+ tries = 2
+ while tries:
+ tries -= 1
+ cache_delta = self.load()
+ if cache_delta is not None and \
+ cache_delta.get("timestamp") != \
+ self._vardb._aux_cache.get("timestamp", False):
+ self._vardb._aux_cache_obj = None
+ else:
+ return cache_delta
+
+ return None
+
+ def recordEvent(self, event, cpv, slot, counter):
+
+ self._vardb.lock()
+ try:
+ deltas_obj = self.load()
+
+ if deltas_obj is None:
+ # We can't record meaningful deltas without
+ # a pre-existing state.
+ return
+
+ delta_node = {
+ "event": event,
+ "package": cpv.cp,
+ "version": cpv.version,
+ "slot": slot,
+ "counter": "%s" % counter
+ }
+
+ deltas_obj["deltas"].append(delta_node)
+
+ # Eliminate earlier nodes cancelled out by later nodes
+ # that have identical package and slot attributes.
+ filtered_list = []
+ slot_keys = set()
+ version_keys = set()
+ for delta_node in reversed(deltas_obj["deltas"]):
+ slot_key = (delta_node["package"],
+ delta_node["slot"])
+ version_key = (delta_node["package"],
+ delta_node["version"])
+ if not (slot_key in slot_keys or \
+ version_key in version_keys):
+ filtered_list.append(delta_node)
+ slot_keys.add(slot_key)
+ version_keys.add(version_key)
+
+ filtered_list.reverse()
+ deltas_obj["deltas"] = filtered_list
+
+ f = atomic_ofstream(self._vardb._cache_delta_filename,
+ mode='w', encoding=_encodings['repo.content'])
+ json.dump(deltas_obj, f, ensure_ascii=False)
+ f.close()
+
+ finally:
+ self._vardb.unlock()
+
+ def applyDelta(self, data):
+ packages = self._vardb._aux_cache["packages"]
+ for delta in data["deltas"]:
+ cpv = delta["package"] + "-" + delta["version"]
+ event = delta["event"]
+ if event == "add":
+ # Use aux_get to populate the cache
+ # for this cpv.
+ if cpv not in packages:
+ try:
+ self._vardb.aux_get(cpv, ["DESCRIPTION"])
+ except KeyError:
+ pass
+ elif event == "remove":
+ packages.pop(cpv, None)
diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
index 6ab4b92..e0cd5f1 100644
--- a/pym/portage/dbapi/vartree.py
+++ b/pym/portage/dbapi/vartree.py
@@ -63,6 +63,7 @@ from portage import _os_merge
from portage import _selinux_merge
from portage import _unicode_decode
from portage import _unicode_encode
+from .VdbMetadataDelta import VdbMetadataDelta
from _emerge.EbuildBuildDir import EbuildBuildDir
from _emerge.EbuildPhase import EbuildPhase
@@ -177,6 +178,9 @@ class vardbapi(dbapi):
self._aux_cache_obj = None
self._aux_cache_filename = os.path.join(self._eroot,
CACHE_PATH, "vdb_metadata.pickle")
+ self._cache_delta_filename = os.path.join(self._eroot,
+ CACHE_PATH, "vdb_metadata_delta.json")
+ self._cache_delta = VdbMetadataDelta(self)
self._counter_path = os.path.join(self._eroot,
CACHE_PATH, "counter")
@@ -556,22 +560,31 @@ class vardbapi(dbapi):
long as at least part of the cache is still valid)."""
if self._flush_cache_enabled and \
self._aux_cache is not None and \
- len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
- secpass >= 2:
+ secpass >= 2 and \
+ (len(self._aux_cache["modified"]) >= self._aux_cache_threshold or
+ not os.path.exists(self._cache_delta_filename)):
+
+ ensure_dirs(os.path.dirname(self._aux_cache_filename))
+
self._owners.populate() # index any unindexed contents
valid_nodes = set(self.cpv_all())
for cpv in list(self._aux_cache["packages"]):
if cpv not in valid_nodes:
del self._aux_cache["packages"][cpv]
del self._aux_cache["modified"]
- try:
- f = atomic_ofstream(self._aux_cache_filename, 'wb')
- pickle.dump(self._aux_cache, f, protocol=2)
- f.close()
- apply_secpass_permissions(
- self._aux_cache_filename, gid=portage_gid, mode=0o644)
- except (IOError, OSError) as e:
- pass
+ timestamp = time.time()
+ self._aux_cache["timestamp"] = timestamp
+
+ f = atomic_ofstream(self._aux_cache_filename, 'wb')
+ pickle.dump(self._aux_cache, f, protocol=2)
+ f.close()
+ apply_secpass_permissions(
+ self._aux_cache_filename, mode=0o644)
+
+ self._cache_delta.initialize(timestamp)
+ apply_secpass_permissions(
+ self._cache_delta_filename, mode=0o644)
+
self._aux_cache["modified"] = set()
@property
@@ -1590,6 +1603,13 @@ class dblink(object):
self.dbdir, noiselevel=-1)
return
+ if self.dbdir is self.dbpkgdir:
+ counter, = self.vartree.dbapi.aux_get(
+ self.mycpv, ["COUNTER"])
+ self.vartree.dbapi._cache_delta.recordEvent(
+ "remove", self.mycpv,
+ self.settings["SLOT"].split("/")[0], counter)
+
shutil.rmtree(self.dbdir)
# If empty, remove parent category directory.
try:
@@ -4196,6 +4216,8 @@ class dblink(object):
self.delete()
_movefile(self.dbtmpdir, self.dbpkgdir, mysettings=self.settings)
self._merged_path(self.dbpkgdir, os.lstat(self.dbpkgdir))
+ self.vartree.dbapi._cache_delta.recordEvent(
+ "add", self.mycpv, slot, counter)
finally:
self.unlockdb()
--
2.0.4
^ permalink raw reply related [flat|nested] 29+ messages in thread
Thread overview: 29+ messages (newest: 2014-11-08 9:16 UTC)
2014-10-18 3:28 [gentoo-portage-dev] [PATCH] emerge --search: use description index Zac Medico
2014-10-18 5:59 ` [gentoo-portage-dev] " Zac Medico
2014-10-19 21:51 ` Zac Medico
2014-10-23 8:55 ` Brian Dolbec
2014-10-23 9:22 ` Zac Medico
2014-11-01 6:15 ` Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
2014-11-04 9:03 ` [gentoo-portage-dev] [PATCH 1/5 v2] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
2014-11-02 0:18 ` Zac Medico
2014-11-02 22:50 ` [gentoo-portage-dev] [PATCH 2/5 v3] " Zac Medico
2014-11-03 3:07 ` [gentoo-portage-dev] [PATCH 2/5 v4] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class Zac Medico
2014-11-04 5:07 ` [gentoo-portage-dev] [PATCH 3/5 v2] " Zac Medico
2014-11-04 20:34 ` [gentoo-portage-dev] [PATCH 3/5 v3] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class Zac Medico
2014-11-05 9:59 ` [gentoo-portage-dev] " Zac Medico
2014-11-07 8:45 ` [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates Zac Medico
2014-11-07 16:51 ` Brian Dolbec
2014-11-07 20:17 ` Zac Medico
2014-11-08 9:16 ` [gentoo-portage-dev] [PATCH v2] " Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
2014-11-01 23:04 ` Zac Medico
2014-11-04 5:42 ` [gentoo-portage-dev] [PATCH 5/5 v3] " Zac Medico
2014-11-04 9:10 ` [gentoo-portage-dev] " Zac Medico
2014-11-04 22:09 ` [gentoo-portage-dev] [PATCH 5/5 v4] " Zac Medico
2014-11-03 21:42 ` [gentoo-portage-dev] Brian Dolbec
2014-11-04 9:19 ` [gentoo-portage-dev] Zac Medico