public inbox for gentoo-commits@lists.gentoo.org
 help / color / mirror / Atom feed
* [gentoo-commits] proj/portage:master commit in: lib/portage/_emirrordist/, lib/portage/tests/ebuild/, man/, ...
@ 2021-02-27  7:52 Zac Medico
  0 siblings, 0 replies; only message in thread
From: Zac Medico @ 2021-02-27  7:52 UTC (permalink / raw)
  To: gentoo-commits

commit:     fd04c5fb1619f86381b5d5e6ff66b20fa3967c43
Author:     Zac Medico <zmedico <AT> gentoo <DOT> org>
AuthorDate: Wed Feb 24 19:56:38 2021 +0000
Commit:     Zac Medico <zmedico <AT> gentoo <DOT> org>
CommitDate: Sat Feb 27 07:43:23 2021 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=fd04c5fb

emirrordist: add --content-db option required for content-hash layout (bug 756778)

Add a --content-db option which is required for the content-hash
layout because its file listings return content digests instead of
distfile names.

The content db serves to translate content digests to distfile
names, and distfile names to content digests. All keys have one or
more prefixes separated by colons. For a digest key, the first
prefix is "digest" and the second prefix is the hash algorithm name.
For a filename key, the prefix is "filename".

The value associated with a digest key is a set of file names. The
value associated with a filename key is a set of content revisions.
Each content revision is expressed as a dictionary of digests which
is suitable for construction of a DistfileName instance.

A given content digest will translate to multiple distfile names if
multiple associations have been created via the content db add
method. The relationship between a content digest and a distfile
name is similar to the relationship between an inode and a hardlink.

Bug: https://bugs.gentoo.org/756778
Signed-off-by: Zac Medico <zmedico <AT> gentoo.org>

 lib/portage/_emirrordist/Config.py           |   6 +
 lib/portage/_emirrordist/ContentDB.py        | 196 +++++++++++++++++++++++++++
 lib/portage/_emirrordist/DeletionIterator.py |  25 +++-
 lib/portage/_emirrordist/DeletionTask.py     |   8 ++
 lib/portage/_emirrordist/FetchTask.py        |   5 +-
 lib/portage/_emirrordist/main.py             |  15 +-
 lib/portage/package/ebuild/fetch.py          |   8 +-
 lib/portage/tests/ebuild/test_fetch.py       | 148 ++++++++++++++++++++
 man/emirrordist.1                            |   6 +-
 9 files changed, 407 insertions(+), 10 deletions(-)

diff --git a/lib/portage/_emirrordist/Config.py b/lib/portage/_emirrordist/Config.py
index 1c7a27d66..a4b75809f 100644
--- a/lib/portage/_emirrordist/Config.py
+++ b/lib/portage/_emirrordist/Config.py
@@ -10,6 +10,7 @@ import time
 from portage import os
 from portage.package.ebuild.fetch import MirrorLayoutConfig
 from portage.util import grabdict, grablines
+from .ContentDB import ContentDB
 
 class Config:
 	def __init__(self, options, portdb, event_loop):
@@ -65,6 +66,11 @@ class Config:
 			self.distfiles_db = self._open_shelve(
 				options.distfiles_db, 'distfiles')
 
+		self.content_db = None
+		if getattr(options, 'content_db', None) is not None:
+			self.content_db = ContentDB(self._open_shelve(
+				options.content_db, 'content'))
+
 		self.deletion_db = None
 		if getattr(options, 'deletion_db', None) is not None:
 			self.deletion_db = self._open_shelve(

diff --git a/lib/portage/_emirrordist/ContentDB.py b/lib/portage/_emirrordist/ContentDB.py
new file mode 100644
index 000000000..d9ce3cc45
--- /dev/null
+++ b/lib/portage/_emirrordist/ContentDB.py
@@ -0,0 +1,196 @@
+# Copyright 2021 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+import logging
+import operator
+import shelve
+import typing
+
+from portage.package.ebuild.fetch import DistfileName
+
+
+class ContentDB:
+	"""
+	The content db serves to translate content digests to distfiles
+	names, and distfiles names to content digests. All keys have one or
+	more prefixes separated by colons. For a digest key, the first
+	prefix is "digest" and the second prefix is the hash algorithm name.
+	For a filename key, the prefix is "filename".
+
+	The value associated with a digest key is a set of file names. The
+	value associated with a distfile key is a set of content revisions.
+	Each content revision is expressed as a dictionary of digests which
+	is suitable for construction of a DistfileName instance.
+	"""
+
+	def __init__(self, shelve_instance: shelve.Shelf):
+		self._shelve = shelve_instance
+
+	def add(self, filename: DistfileName):
+		"""
+		Add file name and digests, creating a new content revision, or
+		incrementing the reference count to an identical content revision
+		if one exists. If the file name had previous content revisions,
+		then they continue to exist independently of the new one.
+
+		@param filename: file name with digests attribute
+		"""
+		distfile_str = str(filename)
+		distfile_key = "filename:{}".format(distfile_str)
+		for k, v in filename.digests.items():
+			if k != "size":
+				digest_key = "digest:{}:{}".format(k.upper(), v.lower())
+				try:
+					digest_files = self._shelve[digest_key]
+				except KeyError:
+					digest_files = set()
+				digest_files.add(distfile_str)
+				self._shelve[digest_key] = digest_files
+		try:
+			content_revisions = self._shelve[distfile_key]
+		except KeyError:
+			content_revisions = set()
+
+		revision_key = tuple(
+			sorted(
+				(
+					(algo.upper(), filename.digests[algo.upper()].lower())
+					for algo in filename.digests
+					if algo != "size"
+				),
+				key=operator.itemgetter(0),
+			)
+		)
+		content_revisions.add(revision_key)
+		self._shelve[distfile_key] = content_revisions
+
+	def remove(self, filename: DistfileName):
+		"""
+		Remove a file name and digests from the database. If identical
+		content is still referenced by one or more other file names,
+		then those references are preserved (like removing one of many
+		hardlinks). Also, this file name may reference other content
+		revisions with different digests, and those content revisions
+		will remain as well.
+
+		@param filename: file name with digests attribute
+		"""
+		distfile_key = "filename:{}".format(filename)
+		try:
+			content_revisions = self._shelve[distfile_key]
+		except KeyError:
+			pass
+		else:
+			remaining = set()
+			for revision_key in content_revisions:
+				if not any(digest_item in revision_key for digest_item in filename.digests.items()):
+					remaining.add(revision_key)
+					continue
+				for k, v in revision_key:
+					digest_key = "digest:{}:{}".format(k, v)
+					try:
+						digest_files = self._shelve[digest_key]
+					except KeyError:
+						digest_files = set()
+
+					try:
+						digest_files.remove(filename)
+					except KeyError:
+						pass
+
+					if digest_files:
+						self._shelve[digest_key] = digest_files
+					else:
+						try:
+							del self._shelve[digest_key]
+						except KeyError:
+							pass
+
+			if remaining:
+				logging.debug(("drop '%s' revision(s) from content db") % filename)
+				self._shelve[distfile_key] = remaining
+			else:
+				logging.debug(("drop '%s' from content db") % filename)
+				try:
+					del self._shelve[distfile_key]
+				except KeyError:
+					pass
+
+	def get_filenames_translate(
+		self, filename: typing.Union[str, DistfileName]
+	) -> typing.Generator[DistfileName, None, None]:
+		"""
+		Translate distfiles content digests to zero or more distfile names.
+		If filename is already a distfile name, then it will pass
+		through unchanged.
+
+		A given content digest will translate to multiple distfile names if
+		multiple associations have been created via the add method. The
+		relationship between a content digest and a distfile name is similar
+		to the relationship between an inode and a hardlink.
+
+		@param filename: A filename listed by layout get_filenames
+		"""
+		if not isinstance(filename, DistfileName):
+			filename = DistfileName(filename)
+
+		# Match content digests with zero or more content revisions.
+		matched_revisions = {}
+
+		for k, v in filename.digests.items():
+			digest_item = (k.upper(), v.lower())
+			digest_key = "digest:{}:{}".format(*digest_item)
+			try:
+				digest_files = self._shelve[digest_key]
+			except KeyError:
+				continue
+
+			for distfile_str in digest_files:
+				matched_revisions.setdefault(distfile_str, set())
+				try:
+					content_revisions = self._shelve["filename:{}".format(distfile_str)]
+				except KeyError:
+					pass
+				else:
+					for revision_key in content_revisions:
+						if (
+							digest_item in revision_key
+							and revision_key not in matched_revisions[distfile_str]
+						):
+							matched_revisions[distfile_str].add(revision_key)
+							yield DistfileName(distfile_str, digests=dict(revision_key))
+
+		if not any(matched_revisions.values()):
+			# Since filename matched zero content revisions, allow
+			# it to pass through unchanged (on the path toward deletion).
+			yield filename
+
+	def __len__(self):
+		return len(self._shelve)
+
+	def __contains__(self, k):
+		return k in self._shelve
+
+	def __iter__(self):
+		return self._shelve.__iter__()
+
+	def items(self):
+		return self._shelve.items()
+
+	def __setitem__(self, k, v):
+		self._shelve[k] = v
+
+	def __getitem__(self, k):
+		return self._shelve[k]
+
+	def __delitem__(self, k):
+		del self._shelve[k]
+
+	def get(self, k, *args):
+		return self._shelve.get(k, *args)
+
+	def close(self):
+		self._shelve.close()
+
+	def clear(self):
+		self._shelve.clear()

diff --git a/lib/portage/_emirrordist/DeletionIterator.py b/lib/portage/_emirrordist/DeletionIterator.py
index 08985ed6c..ab4309f9a 100644
--- a/lib/portage/_emirrordist/DeletionIterator.py
+++ b/lib/portage/_emirrordist/DeletionIterator.py
@@ -1,10 +1,12 @@
-# Copyright 2013-2019 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
+import itertools
 import logging
 import stat
 
 from portage import os
+from portage.package.ebuild.fetch import DistfileName
 from .DeletionTask import DeletionTask
 
 class DeletionIterator:
@@ -21,8 +23,25 @@ class DeletionIterator:
 		deletion_delay = self._config.options.deletion_delay
 		start_time = self._config.start_time
 		distfiles_set = set()
-		for layout in self._config.layouts:
-			distfiles_set.update(layout.get_filenames(distdir))
+		distfiles_set.update(
+			(
+				filename
+				if isinstance(filename, DistfileName)
+				else DistfileName(filename)
+				for filename in itertools.chain.from_iterable(
+					layout.get_filenames(distdir) for layout in self._config.layouts
+				)
+			)
+			if self._config.content_db is None
+			else itertools.chain.from_iterable(
+				(
+					self._config.content_db.get_filenames_translate(filename)
+					for filename in itertools.chain.from_iterable(
+						layout.get_filenames(distdir) for layout in self._config.layouts
+					)
+				)
+			)
+		)
 		for filename in distfiles_set:
 			# require at least one successful stat()
 			exceptions = []

diff --git a/lib/portage/_emirrordist/DeletionTask.py b/lib/portage/_emirrordist/DeletionTask.py
index 5eb01d840..73493c5a1 100644
--- a/lib/portage/_emirrordist/DeletionTask.py
+++ b/lib/portage/_emirrordist/DeletionTask.py
@@ -5,6 +5,7 @@ import errno
 import logging
 
 from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
 from portage.util._async.FileCopier import FileCopier
 from _emerge.CompositeTask import CompositeTask
 
@@ -99,6 +100,10 @@ class DeletionTask(CompositeTask):
 	def _delete_links(self):
 		success = True
 		for layout in self.config.layouts:
+			if isinstance(layout, ContentHashLayout) and not self.distfile.digests:
+				logging.debug(("_delete_links: '%s' has "
+					"no digests") % self.distfile)
+				continue
 			distfile_path = os.path.join(
 				self.config.options.distfiles,
 				layout.get_path(self.distfile))
@@ -134,6 +139,9 @@ class DeletionTask(CompositeTask):
 				logging.debug(("drop '%s' from "
 					"distfiles db") % self.distfile)
 
+		if self.config.content_db is not None:
+			self.config.content_db.remove(self.distfile)
+
 		if self.config.deletion_db is not None:
 			try:
 				del self.config.deletion_db[self.distfile]

diff --git a/lib/portage/_emirrordist/FetchTask.py b/lib/portage/_emirrordist/FetchTask.py
index 997762082..5a48f91cd 100644
--- a/lib/portage/_emirrordist/FetchTask.py
+++ b/lib/portage/_emirrordist/FetchTask.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
 import collections
@@ -47,6 +47,9 @@ class FetchTask(CompositeTask):
 			# Convert _pkg_str to str in order to prevent pickle problems.
 			self.config.distfiles_db[self.distfile] = str(self.cpv)
 
+		if self.config.content_db is not None:
+			self.config.content_db.add(self.distfile)
+
 		if not self._have_needed_digests():
 			msg = "incomplete digests: %s" % " ".join(self.digests)
 			self.scheduler.output(msg, background=self.background,

diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py
index 8d00a05f5..2200ec715 100644
--- a/lib/portage/_emirrordist/main.py
+++ b/lib/portage/_emirrordist/main.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
 import argparse
@@ -7,6 +7,7 @@ import sys
 
 import portage
 from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
 from portage.util import normalize_path, _recursive_file_list
 from portage.util._async.run_main_scheduler import run_main_scheduler
 from portage.util._async.SchedulerInterface import SchedulerInterface
@@ -151,6 +152,12 @@ common_options = (
 			"distfile belongs to",
 		"metavar"  : "FILE"
 	},
+	{
+		"longopt"  : "--content-db",
+		"help"     : "database file used to map content digests to "
+			"distfiles names (required for content-hash layout)",
+		"metavar"  : "FILE"
+	},
 	{
 		"longopt"  : "--recycle-dir",
 		"help"     : "directory for extended retention of files that "
@@ -441,6 +448,12 @@ def emirrordist_main(args):
 		if not options.mirror:
 			parser.error('No action specified')
 
+		if options.delete and config.content_db is None:
+			for layout in config.layouts:
+				if isinstance(layout, ContentHashLayout):
+					parser.error("content-hash layout requires "
+						"--content-db to be specified")
+
 		returncode = os.EX_OK
 
 		if options.mirror:

diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index a683793f0..73abec595 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -365,10 +365,10 @@ class DistfileName(str):
 	In order to prepare for a migration from filename-hash to
 	content-hash layout, all consumers of the layout get_filenames
 	method need to be updated to work with content digests as a
-	substitute for distfile names. For example, in order to prepare
-	emirrordist for content-hash, a key-value store needs to be
-	added as a means to associate distfile names with content
-	digest values yielded by the content-hash get_filenames
+	substitute for distfile names. For example, emirrordist requires
+	the --content-db option when working with a content-hash layout,
+	which serves as a means to associate distfile names
+	with content digest values yielded by the content-hash get_filenames
 	implementation.
 	"""
 	def __new__(cls, s, digests=None):

diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
index d50a4cbfc..24990e4db 100644
--- a/lib/portage/tests/ebuild/test_fetch.py
+++ b/lib/portage/tests/ebuild/test_fetch.py
@@ -4,6 +4,7 @@
 import functools
 import io
 import tempfile
+import types
 
 import portage
 from portage import shutil, os
@@ -28,6 +29,7 @@ from portage.package.ebuild.fetch import (
 	FlatLayout,
 	MirrorLayoutConfig,
 )
+from portage._emirrordist.Config import Config as EmirrordistConfig
 from _emerge.EbuildFetcher import EbuildFetcher
 from _emerge.Package import Package
 
@@ -172,6 +174,16 @@ class EbuildFetchTestCase(TestCase):
 				with open(os.path.join(settings['DISTDIR'], 'layout.conf'), 'wt') as f:
 					f.write(layout_data)
 
+				if any(isinstance(layout, ContentHashLayout) for layout in layouts):
+					content_db = os.path.join(playground.eprefix, 'var/db/emirrordist/content.db')
+					os.makedirs(os.path.dirname(content_db), exist_ok=True)
+					try:
+						os.unlink(content_db)
+					except OSError:
+						pass
+				else:
+					content_db = None
+
 				# Demonstrate that fetch preserves a stale file in DISTDIR when no digests are given.
 				foo_uri = {'foo': ('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, port=server.server_port),)}
 				foo_path = os.path.join(settings['DISTDIR'], 'foo')
@@ -233,9 +245,13 @@ class EbuildFetchTestCase(TestCase):
 					os.path.join(self.bindir, 'emirrordist'),
 					'--distfiles', settings['DISTDIR'],
 					'--config-root', settings['EPREFIX'],
+					'--delete',
 					'--repositories-configuration', settings.repositories.config_string(),
 					'--repo', 'test_repo', '--mirror')
 
+				if content_db is not None:
+					emirrordist_cmd = emirrordist_cmd + ('--content-db', content_db,)
+
 				env = settings.environ()
 				env['PYTHONPATH'] = ':'.join(
 					filter(None, [PORTAGE_PYM_PATH] + os.environ.get('PYTHONPATH', '').split(':')))
@@ -253,6 +269,19 @@ class EbuildFetchTestCase(TestCase):
 					with open(os.path.join(settings['DISTDIR'], layouts[0].get_path(k)), 'rb') as f:
 						self.assertEqual(f.read(), distfiles[k])
 
+				if content_db is not None:
+					loop.run_until_complete(
+						self._test_content_db(
+							emirrordist_cmd,
+							env,
+							layouts,
+							content_db,
+							distfiles,
+							settings,
+							portdb,
+						)
+					)
+
 				# Tests only work with one ebuild at a time, so the config
 				# pool only needs a single config instance.
 				class config_pool:
@@ -427,6 +456,125 @@ class EbuildFetchTestCase(TestCase):
 						settings.features.remove('skiprocheck')
 						settings.features.add('distlocks')
 
+	async def _test_content_db(
+		self, emirrordist_cmd, env, layouts, content_db, distfiles, settings, portdb
+	):
+		# Simulate distfile digest change for ContentDB.
+		emdisopts = types.SimpleNamespace(
+			content_db=content_db, distfiles=settings["DISTDIR"]
+		)
+		with EmirrordistConfig(
+			emdisopts, portdb, asyncio.get_event_loop()
+		) as emdisconf:
+			# Copy revisions from bar to foo.
+			for revision_key in emdisconf.content_db["filename:{}".format("bar")]:
+				emdisconf.content_db.add(
+					DistfileName("foo", digests=dict(revision_key))
+				)
+
+			# Copy revisions from foo to bar.
+			for revision_key in emdisconf.content_db["filename:{}".format("foo")]:
+				emdisconf.content_db.add(
+					DistfileName("bar", digests=dict(revision_key))
+				)
+
+			content_db_state = dict(emdisconf.content_db.items())
+			self.assertEqual(content_db_state, dict(emdisconf.content_db.items()))
+			self.assertEqual(
+				[
+					k[len("filename:") :]
+					for k in content_db_state
+					if k.startswith("filename:")
+				],
+				["bar", "foo"],
+			)
+			self.assertEqual(
+				content_db_state["filename:foo"], content_db_state["filename:bar"]
+			)
+			self.assertEqual(len(content_db_state["filename:foo"]), 2)
+
+		for k in distfiles:
+			try:
+				os.unlink(os.path.join(settings["DISTDIR"], k))
+			except OSError:
+				pass
+
+		proc = await asyncio.create_subprocess_exec(*emirrordist_cmd, env=env)
+		self.assertEqual(await proc.wait(), 0)
+
+		for k in distfiles:
+			with open(
+				os.path.join(settings["DISTDIR"], layouts[0].get_path(k)), "rb"
+			) as f:
+				self.assertEqual(f.read(), distfiles[k])
+
+		with EmirrordistConfig(
+			emdisopts, portdb, asyncio.get_event_loop()
+		) as emdisconf:
+			self.assertEqual(content_db_state, dict(emdisconf.content_db.items()))
+
+			# Verify that remove works as expected
+			filename = [filename for filename in distfiles if filename == "foo"][0]
+			self.assertTrue(bool(filename.digests))
+			emdisconf.content_db.remove(filename)
+			# foo should still have a content revision corresponding to bar's content.
+			self.assertEqual(
+				[
+					k[len("filename:") :]
+					for k in emdisconf.content_db
+					if k.startswith("filename:")
+				],
+				["bar", "foo"],
+			)
+			self.assertEqual(len(emdisconf.content_db["filename:foo"]), 1)
+			self.assertEqual(
+				len(
+					[
+						revision_key
+						for revision_key in emdisconf.content_db["filename:foo"]
+						if not filename.digests_equal(
+							DistfileName(
+								"foo",
+								digests=dict(revision_key),
+							)
+						)
+					]
+				),
+				1,
+			)
+			# bar should still have a content revision corresponding to foo's content.
+			self.assertEqual(len(emdisconf.content_db["filename:bar"]), 2)
+			self.assertEqual(
+				len(
+					[
+						revision_key
+						for revision_key in emdisconf.content_db["filename:bar"]
+						if filename.digests_equal(
+							DistfileName(
+								"bar",
+								digests=dict(revision_key),
+							)
+						)
+					]
+				),
+				1,
+			)
+			# remove the foo which refers to bar's content
+			bar = [filename for filename in distfiles if filename == "bar"][0]
+			foo_remaining = DistfileName("foo", digests=bar.digests)
+			emdisconf.content_db.remove(foo_remaining)
+			self.assertEqual(
+				[
+					k[len("filename:") :]
+					for k in emdisconf.content_db
+					if k.startswith("filename:")
+				],
+				["bar"],
+			)
+			self.assertRaises(KeyError, emdisconf.content_db.__getitem__, "filename:foo")
+			# bar should still have a content revision corresponding to foo's content.
+			self.assertEqual(len(emdisconf.content_db["filename:bar"]), 2)
+
 	def test_flat_layout(self):
 		self.assertTrue(FlatLayout.verify_args(('flat',)))
 		self.assertFalse(FlatLayout.verify_args(('flat', 'extraneous-arg')))

diff --git a/man/emirrordist.1 b/man/emirrordist.1
index 45108ef8c..7ad10dfd0 100644
--- a/man/emirrordist.1
+++ b/man/emirrordist.1
@@ -1,4 +1,4 @@
-.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage"
+.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage"
 .SH "NAME"
 emirrordist \- a fetch tool for mirroring of package distfiles
 .SH SYNOPSIS
@@ -66,6 +66,10 @@ reporting purposes. Opened in append mode.
 Log file for scheduled deletions, with tab\-delimited output, for
 reporting purposes. Overwritten with each run.
 .TP
+\fB\-\-content\-db\fR=\fIFILE\fR
+Database file used to pair content digests with distfiles names
+(required for content\-hash layout).
+.TP
 \fB\-\-delete\fR
 Enable deletion of unused distfiles.
 .TP


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2021-02-27  7:52 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-02-27  7:52 [gentoo-commits] proj/portage:master commit in: lib/portage/_emirrordist/, lib/portage/tests/ebuild/, man/, Zac Medico

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox