public inbox for gentoo-portage-dev@lists.gentoo.org
 help / color / mirror / Atom feed
* [gentoo-portage-dev] [PATCH] emerge --search: use description index
@ 2014-10-18  3:28 Zac Medico
  2014-10-18  5:59 ` [gentoo-portage-dev] " Zac Medico
  2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
  0 siblings, 2 replies; 29+ messages in thread
From: Zac Medico @ 2014-10-18  3:28 UTC (permalink / raw
  To: gentoo-portage-dev

This adds an egencache --update-pkg-desc-index action which generates
a plain-text index of package names, versions, and descriptions. The
index can then be used to optimize emerge --search / --searchdesc
actions. If the package description index is missing from a particular
repository, then all metadata for that repository is obtained using the
normal portdbapi.aux_get method.

Searching of installed packages is optimized to take advantage of
vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
See the IndexedVardb docstring for some more details.

X-Gentoo-Bug: 525718
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
---
 bin/egencache         |  43 ++++++++++-
 man/egencache.1       |   4 ++
 man/portage.5         |   6 ++
 pym/_emerge/search.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 232 insertions(+), 17 deletions(-)

diff --git a/bin/egencache b/bin/egencache
index e366058..90d5e68 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
 from portage.util._eventloop.global_event_loop import global_event_loop
 from portage import cpv_getkey
 from portage.dep import Atom, isjustname
-from portage.versions import pkgsplit, vercmp
+from portage.versions import pkgsplit, vercmp, _pkg_str
 
 try:
 	from xml.etree import ElementTree
@@ -91,6 +91,9 @@ def parse_args(args):
 	actions.add_argument("--update-changelogs",
 		action="store_true",
 		help="update the ChangeLog files from SCM logs")
+	actions.add_argument("--update-pkg-desc-index",
+		action="store_true",
+		help="update package description index")
 	actions.add_argument("--update-manifests",
 		action="store_true",
 		help="update manifests")
@@ -451,6 +454,35 @@ class GenCache(object):
 		if hasattr(trg_cache, '_prune_empty_dirs'):
 			trg_cache._prune_empty_dirs()
 
+class GenPkgDescIndex(object):
+	def __init__(self, portdb, output_file):
+		self.returncode = os.EX_OK
+		self._portdb = portdb
+		self._output_file = output_file
+
+	def run(self):
+
+		portage.util.ensure_dirs(os.path.dirname(self._output_file))
+		f = portage.util.atomic_ofstream(self._output_file,
+			encoding=_encodings["repo.content"])
+
+		portdb = self._portdb
+		for cp in portdb.cp_all():
+			pkgs = portdb.cp_list(cp)
+			if not pkgs:
+				continue
+			desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
+
+			if len(pkgs) == 1:
+				output = "%s: %s\n" % (pkgs[0], desc)
+			else:
+				output = "%s,%s: %s\n" % (pkgs[0],
+					",".join(_pkg_str(cpv).version
+					for cpv in pkgs[1:]), desc)
+			f.write(output)
+
+		f.close()
+
 class GenUseLocalDesc(object):
 	def __init__(self, portdb, output=None,
 			preserve_comments=False):
@@ -893,7 +925,8 @@ def egencache_main(args):
 			local_config=False, env=env)
 
 	if not (options.update or options.update_use_local_desc or
-			options.update_changelogs or options.update_manifests):
+			options.update_changelogs or options.update_manifests or
+			options.update_pkg_desc_index):
 		parser.error('No action specified')
 		return 1
 
@@ -1057,6 +1090,12 @@ def egencache_main(args):
 		else:
 			ret.append(scheduler.returncode)
 
+	if options.update_pkg_desc_index:
+		gen_index = GenPkgDescIndex(portdb, os.path.join(
+			repo_config.location, "metadata", "pkg_desc_index"))
+		gen_index.run()
+		ret.append(gen_index.returncode)
+
 	if options.update_use_local_desc:
 		gen_desc = GenUseLocalDesc(portdb,
 			output=options.uld_output,
diff --git a/man/egencache.1 b/man/egencache.1
index f71feb3..3a3197f 100644
--- a/man/egencache.1
+++ b/man/egencache.1
@@ -19,6 +19,10 @@ for the details on package atom syntax.
 .BR "\-\-update\-changelogs"
 Update the ChangeLog files from SCM logs (supported only in git repos).
 .TP
+.BR "\-\-update\-pkg\-desc\-index"
+Update the package description index which is located at
+\fImetadata/pkg_desc_index\fR in the repository.
+.TP
 .BR "\-\-update\-use\-local\-desc"
 Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
 .TP
diff --git a/man/portage.5 b/man/portage.5
index e399f0f..26856d1 100644
--- a/man/portage.5
+++ b/man/portage.5
@@ -75,6 +75,7 @@ user\-defined package sets
 .BR /usr/portage/metadata/
 .nf
 layout.conf
+pkg_desc_index
 .fi
 .TP
 .BR /usr/portage/profiles/
@@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms
 profile\-formats = portage-2
 .fi
 .RE
+.TP
+.BR pkg_desc_index
+This is an index of packages and descriptions which may be generated
+by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search
+actions.
 .RE
 .TP
 .BR /usr/portage/profiles/
diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
index 4b0fd9f..bf15f11 100644
--- a/pym/_emerge/search.py
+++ b/pym/_emerge/search.py
@@ -3,13 +3,17 @@
 
 from __future__ import print_function
 
+import io
 import re
 import portage
-from portage import os
+from portage import os, _encodings
 from portage.dbapi.porttree import _parse_uri_map
+from portage.dep import Atom
+from portage.exception import InvalidData
 from portage.localization import localized_size
 from portage.output import  bold, bold as white, darkgreen, green, red
 from portage.util import writemsg_stdout
+from portage.versions import _pkg_str
 
 from _emerge.Package import Package
 
@@ -30,7 +34,6 @@ class search(object):
 		The list of available and installed packages is created at object instantiation.
 		This makes successive searches faster."""
 		self.settings = root_config.settings
-		self.vartree = root_config.trees["vartree"]
 		self.spinner = spinner
 		self.verbose = verbose
 		self.searchdesc = searchdesc
@@ -41,9 +44,9 @@ class search(object):
 
 		self._dbs = []
 
-		portdb = root_config.trees["porttree"].dbapi
+		portdb = IndexedPortdb(root_config.trees["porttree"].dbapi)
 		bindb = root_config.trees["bintree"].dbapi
-		vardb = root_config.trees["vartree"].dbapi
+		vardb = IndexedVardb(root_config.trees["vartree"].dbapi)
 
 		if not usepkgonly and portdb._have_root_eclass_dir:
 			self._dbs.append(portdb)
@@ -53,6 +56,7 @@ class search(object):
 
 		self._dbs.append(vardb)
 		self._portdb = portdb
+		self._vardb = vardb
 
 	def _spinner_update(self):
 		if self.spinner:
@@ -97,7 +101,7 @@ class search(object):
 		return {}
 
 	def _visible(self, db, cpv, metadata):
-		installed = db is self.vartree.dbapi
+		installed = db is self._vardb
 		built = installed or db is not self._portdb
 		pkg_type = "ebuild"
 		if installed:
@@ -208,6 +212,20 @@ class search(object):
 					masked=1
 				self.matches["pkg"].append([package,masked])
 			elif self.searchdesc: # DESCRIPTION searching
+				# Check for DESCRIPTION match first, so that we can skip
+				# the expensive visiblity check if it doesn't match.
+				full_package = portage.best(
+					self._xmatch("match-all", package))
+				try:
+					full_desc = self._aux_get(
+						full_package, ["DESCRIPTION"])[0]
+				except KeyError:
+					portage.writemsg(
+						"emerge: search: aux_get() failed, skipping\n",
+						noiselevel=-1)
+					continue
+				if not self.searchre.search(full_desc):
+					continue
 				full_package = self._xmatch("bestmatch-visible", package)
 				if not full_package:
 					#no match found; we don't want to query description
@@ -217,14 +235,8 @@ class search(object):
 						continue
 					else:
 						masked=1
-				try:
-					full_desc = self._aux_get(
-						full_package, ["DESCRIPTION"])[0]
-				except KeyError:
-					print("emerge: search: aux_get() failed, skipping")
-					continue
-				if self.searchre.search(full_desc):
-					self.matches["desc"].append([full_package,masked])
+
+				self.matches["desc"].append((full_package, masked))
 
 		self.sdict = self.setconfig.getSets()
 		for setname in self.sdict:
@@ -262,7 +274,7 @@ class search(object):
 			bold(self.searchkey) + " ]\n")
 		msg.append("[ Applications found : " + \
 			bold(str(self.mlen)) + " ]\n\n")
-		vardb = self.vartree.dbapi
+		vardb = self._vardb
 		metadata_keys = set(Package.metadata_keys)
 		metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
 		metadata_keys = tuple(metadata_keys)
@@ -372,7 +384,11 @@ class search(object):
 	# private interface
 	#
 	def getInstallationStatus(self,package):
-		installed_package = self.vartree.dep_bestmatch(package)
+		installed_package = self._vardb.match(package)
+		if installed_package:
+			installed_package = installed_package[-1]
+		else:
+			installed_package = ""
 		result = ""
 		version = self.getVersion(installed_package,search.VERSION_RELEASE)
 		if len(version) > 0:
@@ -392,3 +408,153 @@ class search(object):
 			result = ""
 		return result
 
+
+class IndexedPortdb(object):
+	"""
+	A portdbapi interface that uses a package description index to
+	improve performance. If the description index is missing for a
+	particular repository, then all metadata for that repository is
+	obtained using the normal pordbapi.aux_get method.
+	"""
+	def __init__(self, portdb):
+		self._portdb = portdb
+		self.cpv_exists = portdb.cpv_exists
+		self.getFetchMap = portdb.getFetchMap
+		self.findname = portdb.findname
+		self._aux_cache_keys = portdb._aux_cache_keys
+		self._have_root_eclass_dir = portdb._have_root_eclass_dir
+		self._cpv_sort_ascending = portdb._cpv_sort_ascending
+		self._desc_cache = None
+		self._cp_map = None
+
+	def _init_index(self):
+		cp_map = {}
+		desc_cache = {}
+		for repo_path in self._portdb.porttrees:
+			outside_repo = os.path.join(self._portdb.depcachedir,
+				repo_path.lstrip(os.sep))
+			for parent_dir in (repo_path, outside_repo):
+				file_path = os.path.join(parent_dir,
+					"metadata", "pkg_desc_index")
+
+				try:
+					with io.open(file_path,
+						encoding=_encodings["repo.content"]) as f:
+						for line in f:
+							pkgs, desc = line.split(":", 1)
+							desc = desc.strip()
+							pkgs = pkgs.split(",")
+							if not pkgs[0]:
+								continue
+							try:
+								pkg = _pkg_str(pkgs[0])
+							except InvalidData:
+								continue
+							cp_list = cp_map.get(pkg.cp)
+							if cp_list is None:
+								cp_list = []
+								cp_map[pkg.cp] = cp_list
+							cp_list.append(pkg)
+							for ver in pkgs[1:]:
+								try:
+									cp_list.append(
+										_pkg_str(pkg.cp + "-" + ver))
+								except InvalidData:
+									pass
+							for cpv in cp_list:
+								desc_cache[cpv] = desc
+				except IOError:
+					pass
+				else:
+					break
+			else:
+				# No descriptions index was found, so populate
+				# cp_map the slow way.
+				for cp in self._portdb.cp_all(trees=[repo_path]):
+					cp_list = cp_map.get(cp)
+					if cp_list is None:
+						cp_list = []
+						cp_map[cp] = cp_list
+					for cpv in self._portdb.cp_list(cp, mytree=repo_path):
+						if cpv not in cp_list:
+							cp_list.append(_pkg_str(cpv))
+
+		self._desc_cache = desc_cache
+		self._cp_map = cp_map
+
+	def cp_all(self):
+		if self._cp_map is None:
+			self._init_index()
+		return list(self._cp_map)
+
+	def match(self, atom):
+		if not isinstance(atom, Atom):
+			atom = Atom(atom)
+		cp_list = self._cp_map.get(atom.cp)
+		if cp_list is None:
+			return []
+		self._portdb._cpv_sort_ascending(cp_list)
+		return portage.match_from_list(atom, cp_list)
+
+	def aux_get(self, cpv, attrs, myrepo = None):
+		if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
+			try:
+				return [self._desc_cache[cpv]]
+			except KeyError:
+				pass
+		return self._portdb.aux_get(cpv, attrs)
+
+
+class IndexedVardb(object):
+	"""
+	A vardbapi interface that sacrifices validation in order to
+	improve performance. It takes advantage of vardbdbapi._aux_cache,
+	which is backed by vardb_metadata.pickle. Since _aux_cache is
+	not updated for every single merge/unmerge (see
+	_aux_cache_threshold), the list of packages is obtained directly
+	from the real vardbapi instance. If a package is missing from
+	_aux_cache, then its metadata is obtained using the normal
+	(validated) vardbapi.aux_get method.
+	"""
+	def __init__(self, vardb):
+		self._vardb = vardb
+		self._aux_cache_keys = vardb._aux_cache_keys
+		self._cpv_sort_ascending = vardb._cpv_sort_ascending
+		self._cp_map = {}
+		self.cpv_exists = vardb.cpv_exists
+
+	def cp_all(self):
+		if self._cp_map:
+			return list(self._cp_map)
+		cp_map = self._cp_map
+		for cpv in self._vardb.cpv_all():
+			cp = portage.cpv_getkey(cpv)
+			if cp is not None:
+				cp_list = cp_map.get(cp)
+				if cp_list is None:
+					cp_list = []
+					cp_map[cp] = cp_list
+				cp_list.append(_pkg_str(cpv))
+		return list(cp_map)
+
+	def match(self, atom):
+		if not isinstance(atom, Atom):
+			atom = Atom(atom)
+		cp_list = self._cp_map.get(atom.cp)
+		if cp_list is None:
+			return []
+		self._vardb._cpv_sort_ascending(cp_list)
+		return portage.match_from_list(atom, cp_list)
+
+	def aux_get(self, cpv, attrs, myrepo = None):
+		pkg_data = self._vardb._aux_cache["packages"].get(cpv)
+		if not isinstance(pkg_data, tuple) or \
+			len(pkg_data) != 2 or \
+			not isinstance(pkg_data[1], dict):
+			pkg_data = None
+		if pkg_data is None:
+			# It may be missing from _aux_cache due to
+			# _aux_cache_threshold.
+			return self._vardb.aux_get(cpv, attrs)
+		metadata = pkg_data[1]
+		return [metadata.get(k, "") for k in attrs]
-- 
2.0.4


^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2014-11-08  9:16 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-10-18  3:28 [gentoo-portage-dev] [PATCH] emerge --search: use description index Zac Medico
2014-10-18  5:59 ` [gentoo-portage-dev] " Zac Medico
2014-10-19 21:51   ` Zac Medico
2014-10-23  8:55     ` Brian Dolbec
2014-10-23  9:22       ` Zac Medico
2014-11-01  6:15         ` Zac Medico
2014-11-01 22:46 ` [gentoo-portage-dev] Zac Medico
2014-11-01 22:46   ` [gentoo-portage-dev] [PATCH 1/5] Add egencache --update-pkg-desc-index action Zac Medico
2014-11-04  9:03     ` [gentoo-portage-dev] [PATCH 1/5 v2] " Zac Medico
2014-11-01 22:46   ` [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy Zac Medico
2014-11-02  0:18     ` Zac Medico
2014-11-02 22:50     ` [gentoo-portage-dev] [PATCH 2/5 v3] " Zac Medico
2014-11-03  3:07     ` [gentoo-portage-dev] [PATCH 2/5 v4] " Zac Medico
2014-11-01 22:46   ` [gentoo-portage-dev] [PATCH 3/5] Add IndexedPortdb class Zac Medico
2014-11-04  5:07     ` [gentoo-portage-dev] [PATCH 3/5 v2] " Zac Medico
2014-11-04 20:34       ` [gentoo-portage-dev] [PATCH 3/5 v3] " Zac Medico
2014-11-01 22:46   ` [gentoo-portage-dev] [PATCH 4/5] Add IndexedVardb class Zac Medico
2014-11-05  9:59     ` [gentoo-portage-dev] " Zac Medico
2014-11-07  8:45       ` [gentoo-portage-dev] [PATCH] Log changes between vdb_metadata.pickle updates Zac Medico
2014-11-07 16:51         ` Brian Dolbec
2014-11-07 20:17           ` Zac Medico
2014-11-08  9:16         ` [gentoo-portage-dev] [PATCH v2] " Zac Medico
2014-11-01 22:46   ` [gentoo-portage-dev] [PATCH 5/5] Add emerge --search-index option Zac Medico
2014-11-01 23:04     ` Zac Medico
2014-11-04  5:42       ` [gentoo-portage-dev] [PATCH 5/5 v3] " Zac Medico
2014-11-04  9:10         ` [gentoo-portage-dev] " Zac Medico
2014-11-04 22:09     ` [gentoo-portage-dev] [PATCH 5/5 v4] " Zac Medico
2014-11-03 21:42   ` [gentoo-portage-dev] Brian Dolbec
2014-11-04  9:19     ` [gentoo-portage-dev] Zac Medico

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox