From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by finch.gentoo.org (Postfix) with ESMTPS id C4C281395E2 for ; Wed, 7 Dec 2016 01:58:35 +0000 (UTC) Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id 6D071E0CB9; Wed, 7 Dec 2016 01:58:34 +0000 (UTC) Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by pigeon.gentoo.org (Postfix) with ESMTPS id 38C50E0CB9 for ; Wed, 7 Dec 2016 01:58:34 +0000 (UTC) Received: from oystercatcher.gentoo.org (oystercatcher.gentoo.org [148.251.78.52]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.gentoo.org (Postfix) with ESMTPS id E77C5341236 for ; Wed, 7 Dec 2016 01:58:32 +0000 (UTC) Received: from localhost.localdomain (localhost [127.0.0.1]) by oystercatcher.gentoo.org (Postfix) with ESMTP id 4F54224B6 for ; Wed, 7 Dec 2016 01:58:31 +0000 (UTC) From: "Mart Raudsepp" To: gentoo-commits@lists.gentoo.org Content-Transfer-Encoding: 8bit Content-type: text/plain; charset=UTF-8 Reply-To: gentoo-dev@lists.gentoo.org, "Mart Raudsepp" Message-ID: <1481075760.dde4a3a9c8fbe76897219886f21d046392d65730.leio@gentoo> Subject: [gentoo-commits] proj/grumpy:master commit in: backend/lib/ X-VCS-Repository: proj/grumpy X-VCS-Files: backend/lib/sync.py X-VCS-Directories: backend/lib/ X-VCS-Committer: leio X-VCS-Committer-Name: Mart Raudsepp X-VCS-Revision: dde4a3a9c8fbe76897219886f21d046392d65730 X-VCS-Branch: master Date: Wed, 7 Dec 2016 01:58:31 +0000 (UTC) Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: gentoo-commits@lists.gentoo.org X-Archives-Salt: 
2168757c-5de7-42bf-908c-74b196cd1409 X-Archives-Hash: aa4b5179720f4f7a5eeb500bf88e2597 commit: dde4a3a9c8fbe76897219886f21d046392d65730 Author: Mart Raudsepp gentoo org> AuthorDate: Wed Dec 7 01:56:00 2016 +0000 Commit: Mart Raudsepp gentoo org> CommitDate: Wed Dec 7 01:56:00 2016 +0000 URL: https://gitweb.gentoo.org/proj/grumpy.git/commit/?id=dde4a3a9 sync: Add package description and maintainers sync Maintains a sync timestamp to skip recently synced packages, so if a previous run got stuck, we can skip re-doing it too soon. Saves the DB transaction after every 100 packages, because packages.g.o seems to rate-limit us, so at least we will have things saved into DB periodically to cancel out when we get stuck and restart. backend/lib/sync.py | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/backend/lib/sync.py b/backend/lib/sync.py index e53fa9b..567da2d 100644 --- a/backend/lib/sync.py +++ b/backend/lib/sync.py @@ -1,8 +1,11 @@ import xml.etree.ElementTree as ET import requests +import time +from datetime import datetime from .. 
import app, db from .models import Category, Maintainer, Package, PackageVersion +SYNC_BUFFER_SECS = 30*60 proj_url = "https://api.gentoo.org/metastructure/projects.xml" pkg_url_base = "https://packages.gentoo.org/" http_session = requests.session() @@ -144,11 +147,49 @@ def sync_packages(): db.session.commit() def sync_versions(): - for package in Package.query.all(): + cnt = 0 + ts = datetime.utcfromtimestamp(time.time() - SYNC_BUFFER_SECS) + now = datetime.utcnow() + existing_maintainers = {} + for maintainer in Maintainer.query.all(): + existing_maintainers[maintainer.email] = maintainer + + for package in Package.query.filter(Package.last_sync_ts < ts).all(): + cnt += 1 data = http_session.get(pkg_url_base + "packages/" + package.full_name + ".json") if not data: print("No JSON data for package %s" % package.full_name) # FIXME: Handle better; e.g. mark the package as removed if no pkgmove update continue - from pprint import pprint - pprint(data.json()) - break + + pkg = data.json() + + print ("Updating package: %s" % package.full_name) + if 'description' in pkg: + package.description = pkg['description'] + + maintainers = [] + if 'maintainers' in pkg: + for maint in pkg['maintainers']: + if 'email' not in maint: + print("WARNING: Package %s was told to have a maintainer without an e-mail identifier" % package.full_name) + continue + if maint['email'] in existing_maintainers: # FIXME: Some proxy-maintainers are using mixed-case e-mail addresses; right now we'd be creating duplicates if the case is different across different packages + maintainers.append(existing_maintainers[maint['email']]) + else: + is_project = False + if 'type' in maint and maint['type'] == 'project': + is_project = True + print("Adding %s maintainer %s" % ("project" if is_project else "individual", maint['email'])) + new_maintainer = Maintainer(email=maint['email'], is_project=is_project, name=maint['name'] if 'name' in maint else None) + db.session.add(new_maintainer) + 
existing_maintainers[maint['email']] = new_maintainer + maintainers.append(new_maintainer) + + # Intentionally outside if 'maintainers' in pkg, because if there are no maintainers in JSON, it has fallen to maintainer-needed and we need to clean out old maintainer entries + package.maintainers = maintainers # TODO: Retain order to know who is primary; retain description associated with the maintainership + package.last_sync_ts = now + + if not cnt % 100: + print("%d packages updated, committing DB transaction" % cnt) + db.session.commit() + now = datetime.utcnow()