From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80]) by finch.gentoo.org (Postfix) with ESMTP id B888A138BD2 for ; Sun, 2 Nov 2014 22:50:18 +0000 (UTC) Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id 36D90E089B; Sun, 2 Nov 2014 22:50:18 +0000 (UTC) Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by pigeon.gentoo.org (Postfix) with ESMTPS id A7011E0898 for ; Sun, 2 Nov 2014 22:50:17 +0000 (UTC) Received: from localhost.localdomain (ip70-181-96-121.oc.oc.cox.net [70.181.96.121]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) (Authenticated sender: zmedico) by smtp.gentoo.org (Postfix) with ESMTPSA id 96E3F3403B9; Sun, 2 Nov 2014 22:50:16 +0000 (UTC) From: Zac Medico To: gentoo-portage-dev@lists.gentoo.org Cc: Zac Medico Subject: [gentoo-portage-dev] [PATCH 2/5 v3] Add IndexStreamIterator and MultiIterGroupBy. Date: Sun, 2 Nov 2014 14:50:11 -0800 Message-Id: <1414968611-20093-1-git-send-email-zmedico@gentoo.org> X-Mailer: git-send-email 2.0.4 In-Reply-To: <1414881983-19877-3-git-send-email-zmedico@gentoo.org> References: <1414881983-19877-3-git-send-email-zmedico@gentoo.org> Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: gentoo-portage-dev@lists.gentoo.org Reply-to: gentoo-portage-dev@lists.gentoo.org X-Archives-Salt: 4bdbe7bf-e245-4c59-b6b7-8299ac97c87e X-Archives-Hash: 90f0edafc020b9947ab092dbc4b757c5 This IndexStreamIterator class can be used together with the pkg_desc_index_line_read function to read an index file incrementally as a stream. The MultiIterGroupBy class can be used to iterate over multiple IndexStreamIterator instances at once, incrementally grouping results for a particular package from multiple indices, while limiting the amount of any given index that must be in memory at once. Both of these classes are used by the IndexedPortdb class in the next patch of this series. X-Gentoo-Bug: 525718 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718 --- This updated patch cleans up the logic (possibly fixing bugs), and optimizes it to avoid over-buffering (memory waste). Also, I added a TODO note to use a binary search tree to optimize the search for completed groups. pym/portage/cache/index/IndexStreamIterator.py | 27 ++++++++ pym/portage/util/iterators/MultiIterGroupBy.py | 94 ++++++++++++++++++++++++++ pym/portage/util/iterators/__init__.py | 2 + 3 files changed, 123 insertions(+) create mode 100644 pym/portage/cache/index/IndexStreamIterator.py create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py create mode 100644 pym/portage/util/iterators/__init__.py diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py new file mode 100644 index 0000000..972aee1 --- /dev/null +++ b/pym/portage/cache/index/IndexStreamIterator.py @@ -0,0 +1,27 @@ +# Copyright 2014 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 + +class IndexStreamIterator(object): + + def __init__(self, f, parser): + + self.parser = parser + self._file = f + + def close(self): + + if self._file is not None: + self._file.close() + self._file = None + + def __iter__(self): + + try: + + for line in self._file: + node = self.parser(line) + if node is not None: + yield node + + finally: + self.close() diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py new file mode 100644 index 0000000..2d8652e --- /dev/null +++ b/pym/portage/util/iterators/MultiIterGroupBy.py @@ -0,0 +1,94 @@ +# Copyright 2014 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 + +class MultiIterGroupBy(object): + """ + This class functions similarly to the itertools.groupby function, + except that it takes multiple source iterators as input. The source + iterators must yield objects in sorted order. A group is yielded as + soon as the progress of all iterators reaches a state which + guarantees that there can not be any remaining (unseen) elements of + the group. This is useful for incremental display of grouped search + results. + """ + + def __init__(self, iterators, key = None): + self._iterators = iterators + self._key = key + + def __iter__(self): + + trackers = [] + for iterator in self._iterators: + trackers.append(_IteratorTracker(iterator)) + + key_map = {} + eof = [] + key_getter = self._key + if key_getter is None: + key_getter = lambda x: x + min_progress = None + + while trackers: + + for tracker in trackers: + + if tracker.current is not None and \ + tracker.current != min_progress: + # The trackers are sorted by progress, so the + # remaining trackers are guaranteed to have + # sufficient progress. + continue + + # In order to avoid over-buffering (waste of memory), + # only grab a single entry. + try: + entry = next(tracker.iterator) + except StopIteration: + eof.append(tracker) + else: + tracker.current = key_getter(entry) + key_group = key_map.get(tracker.current) + if key_group is None: + key_group = [] + key_map[tracker.current] = key_group + key_group.append(entry) + + if eof: + for tracker in eof: + trackers.remove(tracker) + del eof[:] + + if trackers: + trackers.sort() + min_progress = trackers[0].current + yield_these = [] + # TODO: Use a binary search tree to optimize this loop. + for k in key_map: + if k <= min_progress: + yield_these.append(k) + else: + yield_these = list(key_map) + + if yield_these: + yield_these.sort() + for k in yield_these: + yield key_map.pop(k) + +class _IteratorTracker(object): + + __slots__ = ('current', 'iterator') + + def __init__(self, iterator): + + self.iterator = iterator + self.current = None + + def __lt__(self, other): + if self.current is None: + if other.current is None: + return False + else: + return True + return other.current is not None and \ + self.current < other.current diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py new file mode 100644 index 0000000..7cd880e --- /dev/null +++ b/pym/portage/util/iterators/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2014 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 -- 2.0.4