* [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846)
@ 2015-10-03 22:33 Zac Medico
2015-10-04 21:12 ` Brian Dolbec
0 siblings, 1 reply; 2+ messages in thread
From: Zac Medico @ 2015-10-03 22:33 UTC (permalink / raw)
To: gentoo-portage-dev; +Cc: Zac Medico
Decode all arguments and listdir results as UTF-8, and return
unsuccessfully if anything fails to decode as UTF-8. Use portage
os and shutil wrappers to encode file names as UTF-8 regardless
of locale.
X-Gentoo-Bug: 561846
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=561846
---
bin/dohtml.py | 47 +++++++++++++++++++++++++++++++++++------------
1 file changed, 35 insertions(+), 12 deletions(-)
diff --git a/bin/dohtml.py b/bin/dohtml.py
index 5359f5e..dfcaa60 100755
--- a/bin/dohtml.py
+++ b/bin/dohtml.py
@@ -28,13 +28,13 @@
# - will do as 'dohtml -r', but ignore directories named CVS, SCCS, RCS
#
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
-import os
-import shutil
+import os as _os
import sys
-from portage.util import normalize_path
+from portage import _unicode_encode, _unicode_decode, os, shutil
+from portage.util import normalize_path, writemsg
# Change back to original cwd _after_ all imports (bug #469338).
os.chdir(os.environ["__PORTAGE_HELPER_CWD"])
@@ -92,7 +92,13 @@ def install(basename, dirname, options, prefix=""):
skipped_files.append(fullpath)
elif options.recurse and os.path.isdir(fullpath) and \
basename not in options.disallowed_dirs:
- for i in os.listdir(fullpath):
+ for i in _os.listdir(_unicode_encode(fullpath)):
+ try:
+ i = _unicode_decode(i, errors='strict')
+ except UnicodeDecodeError:
+ writemsg('dohtml: argument is not encoded as UTF-8: %s\n' %
+ _unicode_decode(i), noiselevel=-1)
+ sys.exit(1)
pfx = basename
if prefix:
pfx = os.path.join(prefix, pfx)
@@ -155,12 +161,29 @@ def print_help():
print()
def parse_args():
+ argv = sys.argv[:]
+
+ if sys.hexversion >= 0x3000000:
+ # We can't trust that the filesystem encoding (locale dependent)
+ # correctly matches the arguments, so use surrogateescape to
+ # pass through the original argv bytes for Python 3.
+ fs_encoding = sys.getfilesystemencoding()
+ argv = [x.encode(fs_encoding, 'surrogateescape') for x in argv]
+
+ for x, arg in enumerate(argv):
+ try:
+ argv[x] = _unicode_decode(arg, errors='strict')
+ except UnicodeDecodeError:
+ writemsg('dohtml: argument is not encoded as UTF-8: %s\n' %
+ _unicode_decode(arg), noiselevel=-1)
+ sys.exit(1)
+
options = OptionsClass()
args = []
x = 1
- while x < len(sys.argv):
- arg = sys.argv[x]
+ while x < len(argv):
+ arg = argv[x]
if arg in ["-h","-r","-V"]:
if arg == "-h":
print_help()
@@ -169,17 +192,17 @@ def parse_args():
options.recurse = True
elif arg == "-V":
options.verbose = True
- elif sys.argv[x] in ["-A","-a","-f","-x","-p"]:
+ elif argv[x] in ["-A","-a","-f","-x","-p"]:
x += 1
- if x == len(sys.argv):
+ if x == len(argv):
print_help()
sys.exit(0)
elif arg == "-p":
- options.doc_prefix = sys.argv[x]
+ options.doc_prefix = argv[x]
if options.doc_prefix:
options.doc_prefix = normalize_path(options.doc_prefix)
else:
- values = sys.argv[x].split(",")
+ values = argv[x].split(",")
if arg == "-A":
options.allowed_exts.extend(values)
elif arg == "-a":
@@ -189,7 +212,7 @@ def parse_args():
elif arg == "-x":
options.disallowed_dirs = values
else:
- args.append(sys.argv[x])
+ args.append(argv[x])
x += 1
return (options, args)
--
2.4.6
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846)
2015-10-03 22:33 [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846) Zac Medico
@ 2015-10-04 21:12 ` Brian Dolbec
0 siblings, 0 replies; 2+ messages in thread
From: Brian Dolbec @ 2015-10-04 21:12 UTC (permalink / raw)
To: gentoo-portage-dev
On Sat, 3 Oct 2015 15:33:03 -0700
Zac Medico <zmedico@gentoo.org> wrote:
> Decode all arguments and listdir results as UTF-8, and return
> unsuccessfully if anything fails to decode as UTF-8. Use portage
> os and shutil wrappers to encode file names as UTF-8 regardless
> of locale.
>
> X-Gentoo-Bug: 561846
> X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=561846
> ---
> bin/dohtml.py | 47 +++++++++++++++++++++++++++++++++++------------
> 1 file changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/bin/dohtml.py b/bin/dohtml.py
> index 5359f5e..dfcaa60 100755
> --- a/bin/dohtml.py
> +++ b/bin/dohtml.py
> @@ -28,13 +28,13 @@
> # - will do as 'dohtml -r', but ignore directories named CVS, SCCS, RCS
> #
>
> -from __future__ import print_function
> +from __future__ import print_function, unicode_literals
>
> -import os
> -import shutil
> +import os as _os
> import sys
>
> -from portage.util import normalize_path
> +from portage import _unicode_encode, _unicode_decode, os, shutil
> +from portage.util import normalize_path, writemsg
>
> # Change back to original cwd _after_ all imports (bug #469338).
> os.chdir(os.environ["__PORTAGE_HELPER_CWD"])
> @@ -92,7 +92,13 @@ def install(basename, dirname, options, prefix=""):
> skipped_files.append(fullpath)
> elif options.recurse and os.path.isdir(fullpath) and \
> basename not in options.disallowed_dirs:
> - for i in os.listdir(fullpath):
> + for i in _os.listdir(_unicode_encode(fullpath)):
> + try:
> + i = _unicode_decode(i,
> errors='strict')
> + except UnicodeDecodeError:
> + writemsg('dohtml: argument is not
> encoded as UTF-8: %s\n' %
> + _unicode_decode(i),
> noiselevel=-1)
> + sys.exit(1)
> pfx = basename
> if prefix:
> pfx = os.path.join(prefix, pfx)
> @@ -155,12 +161,29 @@ def print_help():
> print()
>
> def parse_args():
> + argv = sys.argv[:]
> +
> + if sys.hexversion >= 0x3000000:
> + # We can't trust that the filesystem encoding
> (locale dependent)
> + # correctly matches the arguments, so use
> surrogateescape to
> + # pass through the original argv bytes for Python 3.
> + fs_encoding = sys.getfilesystemencoding()
> + argv = [x.encode(fs_encoding, 'surrogateescape') for
> x in argv] +
> + for x, arg in enumerate(argv):
> + try:
> + argv[x] = _unicode_decode(arg,
> errors='strict')
> + except UnicodeDecodeError:
> + writemsg('dohtml: argument is not encoded as
> UTF-8: %s\n' %
> + _unicode_decode(arg), noiselevel=-1)
> + sys.exit(1)
> +
> options = OptionsClass()
> args = []
>
> x = 1
> - while x < len(sys.argv):
> - arg = sys.argv[x]
> + while x < len(argv):
> + arg = argv[x]
> if arg in ["-h","-r","-V"]:
> if arg == "-h":
> print_help()
> @@ -169,17 +192,17 @@ def parse_args():
> options.recurse = True
> elif arg == "-V":
> options.verbose = True
> - elif sys.argv[x] in ["-A","-a","-f","-x","-p"]:
> + elif argv[x] in ["-A","-a","-f","-x","-p"]:
> x += 1
> - if x == len(sys.argv):
> + if x == len(argv):
> print_help()
> sys.exit(0)
> elif arg == "-p":
> - options.doc_prefix = sys.argv[x]
> + options.doc_prefix = argv[x]
> if options.doc_prefix:
> options.doc_prefix =
> normalize_path(options.doc_prefix) else:
> - values = sys.argv[x].split(",")
> + values = argv[x].split(",")
> if arg == "-A":
> options.allowed_exts.extend(values)
> elif arg == "-a":
> @@ -189,7 +212,7 @@ def parse_args():
> elif arg == "-x":
> options.disallowed_dirs =
> values else:
> - args.append(sys.argv[x])
> + args.append(argv[x])
> x += 1
>
> return (options, args)
Looks good
--
Brian Dolbec <dolsen>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2015-10-04 21:13 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-03 22:33 [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846) Zac Medico
2015-10-04 21:12 ` Brian Dolbec
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.