From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by finch.gentoo.org (Postfix) with ESMTPS id 8C6F7138334 for ; Thu, 19 Dec 2019 21:02:54 +0000 (UTC) Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id 915CBE0960; Thu, 19 Dec 2019 21:02:50 +0000 (UTC) Received: from smtp.gentoo.org (woodpecker.gentoo.org [IPv6:2001:470:ea4a:1:5054:ff:fec7:86e4]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by pigeon.gentoo.org (Postfix) with ESMTPS id 55585E0960 for ; Thu, 19 Dec 2019 21:02:50 +0000 (UTC) Received: from oystercatcher.gentoo.org (oystercatcher.gentoo.org [148.251.78.52]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.gentoo.org (Postfix) with ESMTPS id 777E034DA3F for ; Thu, 19 Dec 2019 21:02:48 +0000 (UTC) Received: from localhost.localdomain (localhost [IPv6:::1]) by oystercatcher.gentoo.org (Postfix) with ESMTP id 9CC7298F for ; Thu, 19 Dec 2019 21:02:44 +0000 (UTC) From: "Göktürk Yüksek" To: gentoo-commits@lists.gentoo.org Content-Transfer-Encoding: 8bit Content-type: text/plain; charset=UTF-8 Reply-To: gentoo-dev@lists.gentoo.org, "Göktürk Yüksek" Message-ID: <1576789082.926e0d0855afa40f5dcbc16b1b7c66187afd7d73.gokturk@gentoo> Subject: [gentoo-commits] proj/devmanual:master commit in: /, bin/ X-VCS-Repository: proj/devmanual X-VCS-Files: bin/build_search_documents.py search.js X-VCS-Directories: / bin/ X-VCS-Committer: gokturk X-VCS-Committer-Name: Göktürk Yüksek X-VCS-Revision: 926e0d0855afa40f5dcbc16b1b7c66187afd7d73 X-VCS-Branch: master Date: Thu, 19 Dec 2019 21:02:44 +0000 (UTC) Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: 
gentoo-commits@lists.gentoo.org X-Auto-Response-Suppress: DR, RN, NRN, OOF, AutoReply X-Archives-Salt: 317424d3-a4df-4cf8-b3e3-12c95c2c91e5 X-Archives-Hash: 13d30a581130bb15fd368858ed83eba1 commit: 926e0d0855afa40f5dcbc16b1b7c66187afd7d73 Author: Göktürk Yüksek gentoo org> AuthorDate: Tue Dec 10 02:08:12 2019 +0000 Commit: Göktürk Yüksek gentoo org> CommitDate: Thu Dec 19 20:58:02 2019 +0000 URL: https://gitweb.gentoo.org/proj/devmanual.git/commit/?id=926e0d08 Rewrite the search functionality and extend the coverage The current script only indexes the first

<p> in a text.xml, and sometimes only partially if the text is interrupted by another tag such as <d/>. Modify build_search_documents.py such that: - It recursively traverses from chapter all the way down to subsubsection - Each

<p>, <important>, <note>, <warning> is indexed separately - In the search results, the match entry will have the title in the form "Chapter[ -> Section[ -> Subsection[ -> Subsubsection]]]" Modify search.js such that: - The ref returned for a match is its index into "documents" array, which makes it possible to retrieve the document in O(1). Signed-off-by: Göktürk Yüksek gentoo.org> bin/build_search_documents.py | 112 ++++++++++++++++++++++++++++++++++++------ search.js | 22 ++++----- 2 files changed, 108 insertions(+), 26 deletions(-) diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py index 9af2753..3816fdb 100755 --- a/bin/build_search_documents.py +++ b/bin/build_search_documents.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Copyright 2019 Gentoo Authors # Distributed under the terms of the GNU GPL version 2 or later import json @@ -6,19 +6,103 @@ import os.path import sys import xml.etree.ElementTree as ET -files = sys.argv[1:] -documents = [] -url_root = 'https://devmanual.gentoo.org/' -for f in files: - tree = ET.parse(f) - root = tree.getroot() - for chapter in root.findall('chapter'): +def stringify_node(parent: ET.Element) -> str: + """Flatten this node and its immediate children to a string. + + Combine the text and tail of this node, and any of its immediate + children, if there are any, into a flat string. The tag is a + special case that resolves to the dash ('-') character. 
+ + Keyword arguments: + parent -- the node to convert to a string + + """ + if parent.text: + text = parent.text.lstrip() + else: + text = str() + + for child in parent.getchildren(): + # The '' tag is simply a fancier '-' character + if child.tag == 'd': + text += '-' + if child.text: + text += child.text.lstrip() + if child.tail: + text += child.tail.rstrip() + + text += parent.tail.rstrip() + return text.replace('\n', ' ') + + +def process_node(documents: list, node: ET.Element, name: str, url: str) -> None: + """Recursively process a given node and its children based on tag values. + + For the top level node , extract the title and recurse + down to the children. + For the intermediary nodes with titles, such as

, update + the search result title and url, and recurse down. + For the terminal nodes, such as

, convert the contents of the + node to a string, and add it to the search documents. + + Keyword arguments: + documents -- the search documents array + node -- the node to process + name -- the title to display for the search term match + url -- the url for the search term match in the document + + """ + if node.tag == 'chapter': + name = stringify_node(node.find('title')) + + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['section', 'subsection', 'subsubsection']: + title = stringify_node(node.find('title')) + name += ' -> ' + title + url = "{url_base}#{anchor}".format( + url_base=url.split('#')[0], + anchor=title.lower().replace(' ', '-')) + + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['body', 'guide']: + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['p', 'important', 'note', 'warning']: + text = stringify_node(node) + + documents.append({'id': len(documents), + 'name': name, + 'text': text, + 'url': url}) + else: + pass + + +def main(pathnames: list) -> None: + """The entry point of the script. 
+ + Keyword arguments: + pathnames -- a list of path names to process in sequential order + """ + url_root = 'https://devmanual.gentoo.org/' + documents = [] + + for path in pathnames: + tree = ET.parse(path) + root = tree.getroot() + try: - documents.append({"name": chapter.find('title').text, - "text": chapter.find('body').find('p').text, - "url": url_root + os.path.dirname(f) + '/'}) - except AttributeError: - pass + url = url_root + os.path.dirname(path) + '/' + + process_node(documents, root, None, url) + except: + raise + + print('var documents = ' + json.dumps(documents) + ';') + -print('var documents = ' + json.dumps(documents) + ';') +if __name__ in '__main__': + main(sys.argv[1:]) diff --git a/search.js b/search.js index 0b9292f..ab28f87 100644 --- a/search.js +++ b/search.js @@ -5,9 +5,9 @@ "use strict"; var search_index = lunr(function () { - this.ref('name'); + this.ref('id'); this.field('text'); - this.field('url'); + this.metadataWhitelist = ['position'] documents.forEach(function (doc) { this.add(doc); @@ -23,15 +23,13 @@ search_input.addEventListener("keyup", function(event) { } }); -function getContents(docs, article) { - var contents = { text: "", url: "" }; +function getContents(docs, uid) { + var contents = { name: "", text: "", url: "" }; + + contents.name = docs[uid].name; + contents.text = docs[uid].text; + contents.url = docs[uid].url; - for (var i = 0; i< docs.length; i++) { - if (docs[i].name == article) { - contents.text = docs[i].text; - contents.url = docs[i].url; - } - } return contents; } @@ -42,8 +40,8 @@ function search() { if (results.length > 0) { $("#searchResults .modal-body").empty(); $.each(results, function(index, result) { - var title = result.ref; - var contents = getContents(documents, title); + var uid = result.ref; + var contents = getContents(documents, uid); $("#searchResults .modal-body").append(`

`);