public inbox for gentoo-portage-dev@lists.gentoo.org
 help / color / mirror / Atom feed
* [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846)
@ 2015-10-03 22:33 Zac Medico
  2015-10-04 21:12 ` Brian Dolbec
  0 siblings, 1 reply; 2+ messages in thread
From: Zac Medico @ 2015-10-03 22:33 UTC (permalink / raw)
  To: gentoo-portage-dev; +Cc: Zac Medico

Decode all arguments and listdir results as UTF-8, and return
unsuccessfully if anything fails to decode as UTF-8. Use portage
os and shutil wrappers to encode file names as UTF-8 regardless
of locale.

X-Gentoo-Bug: 561846
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=561846
---
 bin/dohtml.py | 47 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/bin/dohtml.py b/bin/dohtml.py
index 5359f5e..dfcaa60 100755
--- a/bin/dohtml.py
+++ b/bin/dohtml.py
@@ -28,13 +28,13 @@
 #  - will do as 'dohtml -r', but ignore directories named CVS, SCCS, RCS
 #
 
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
 
-import os
-import shutil
+import os as _os
 import sys
 
-from portage.util import normalize_path
+from portage import _unicode_encode, _unicode_decode, os, shutil
+from portage.util import normalize_path, writemsg
 
 # Change back to original cwd _after_ all imports (bug #469338).
 os.chdir(os.environ["__PORTAGE_HELPER_CWD"])
@@ -92,7 +92,13 @@ def install(basename, dirname, options, prefix=""):
 			skipped_files.append(fullpath)
 	elif options.recurse and os.path.isdir(fullpath) and \
 	     basename not in options.disallowed_dirs:
-		for i in os.listdir(fullpath):
+		for i in _os.listdir(_unicode_encode(fullpath)):
+			try:
+				i = _unicode_decode(i, errors='strict')
+			except UnicodeDecodeError:
+				writemsg('dohtml: argument is not encoded as UTF-8: %s\n' %
+					_unicode_decode(i), noiselevel=-1)
+				sys.exit(1)
 			pfx = basename
 			if prefix:
 				pfx = os.path.join(prefix, pfx)
@@ -155,12 +161,29 @@ def print_help():
 	print()
 
 def parse_args():
+	argv = sys.argv[:]
+
+	if sys.hexversion >= 0x3000000:
+		# We can't trust that the filesystem encoding (locale dependent)
+		# correctly matches the arguments, so use surrogateescape to
+		# pass through the original argv bytes for Python 3.
+		fs_encoding = sys.getfilesystemencoding()
+		argv = [x.encode(fs_encoding, 'surrogateescape') for x in argv]
+
+	for x, arg in enumerate(argv):
+		try:
+			argv[x] = _unicode_decode(arg, errors='strict')
+		except UnicodeDecodeError:
+			writemsg('dohtml: argument is not encoded as UTF-8: %s\n' %
+				_unicode_decode(arg), noiselevel=-1)
+			sys.exit(1)
+
 	options = OptionsClass()
 	args = []
 
 	x = 1
-	while x < len(sys.argv):
-		arg = sys.argv[x]
+	while x < len(argv):
+		arg = argv[x]
 		if arg in ["-h","-r","-V"]:
 			if arg == "-h":
 				print_help()
@@ -169,17 +192,17 @@ def parse_args():
 				options.recurse = True
 			elif arg == "-V":
 				options.verbose = True
-		elif sys.argv[x] in ["-A","-a","-f","-x","-p"]:
+		elif argv[x] in ["-A","-a","-f","-x","-p"]:
 			x += 1
-			if x == len(sys.argv):
+			if x == len(argv):
 				print_help()
 				sys.exit(0)
 			elif arg == "-p":
-				options.doc_prefix = sys.argv[x]
+				options.doc_prefix = argv[x]
 				if options.doc_prefix:
 					options.doc_prefix = normalize_path(options.doc_prefix)
 			else:
-				values = sys.argv[x].split(",")
+				values = argv[x].split(",")
 				if arg == "-A":
 					options.allowed_exts.extend(values)
 				elif arg == "-a":
@@ -189,7 +212,7 @@ def parse_args():
 				elif arg == "-x":
 					options.disallowed_dirs = values
 		else:
-			args.append(sys.argv[x])
+			args.append(argv[x])
 		x += 1
 
 	return (options, args)
-- 
2.4.6



^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846)
  2015-10-03 22:33 [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846) Zac Medico
@ 2015-10-04 21:12 ` Brian Dolbec
  0 siblings, 0 replies; 2+ messages in thread
From: Brian Dolbec @ 2015-10-04 21:12 UTC (permalink / raw)
  To: gentoo-portage-dev

On Sat,  3 Oct 2015 15:33:03 -0700
Zac Medico <zmedico@gentoo.org> wrote:

> Decode all arguments and listdir results as UTF-8, and return
> unsuccessfully if anything fails to decode as UTF-8. Use portage
> os and shutil wrappers to encode file names as UTF-8 regardless
> of locale.
> 
> X-Gentoo-Bug: 561846
> X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=561846
> ---
>  bin/dohtml.py | 47 +++++++++++++++++++++++++++++++++++------------
>  1 file changed, 35 insertions(+), 12 deletions(-)
> 
> diff --git a/bin/dohtml.py b/bin/dohtml.py
> index 5359f5e..dfcaa60 100755
> --- a/bin/dohtml.py
> +++ b/bin/dohtml.py
> @@ -28,13 +28,13 @@
>  #  - will do as 'dohtml -r', but ignore directories named CVS, SCCS, RCS
>  #
>  
> -from __future__ import print_function
> +from __future__ import print_function, unicode_literals
>  
> -import os
> -import shutil
> +import os as _os
>  import sys
>  
> -from portage.util import normalize_path
> +from portage import _unicode_encode, _unicode_decode, os, shutil
> +from portage.util import normalize_path, writemsg
>  
>  # Change back to original cwd _after_ all imports (bug #469338).
>  os.chdir(os.environ["__PORTAGE_HELPER_CWD"])
> @@ -92,7 +92,13 @@ def install(basename, dirname, options, prefix=""):
>  			skipped_files.append(fullpath)
>  	elif options.recurse and os.path.isdir(fullpath) and \
>  	     basename not in options.disallowed_dirs:
> -		for i in os.listdir(fullpath):
> +		for i in _os.listdir(_unicode_encode(fullpath)):
> +			try:
> +				i = _unicode_decode(i,
> errors='strict')
> +			except UnicodeDecodeError:
> +				writemsg('dohtml: argument is not
> encoded as UTF-8: %s\n' %
> +					_unicode_decode(i),
> noiselevel=-1)
> +				sys.exit(1)
>  			pfx = basename
>  			if prefix:
>  				pfx = os.path.join(prefix, pfx)
> @@ -155,12 +161,29 @@ def print_help():
>  	print()
>  
>  def parse_args():
> +	argv = sys.argv[:]
> +
> +	if sys.hexversion >= 0x3000000:
> +		# We can't trust that the filesystem encoding
> (locale dependent)
> +		# correctly matches the arguments, so use
> surrogateescape to
> +		# pass through the original argv bytes for Python 3.
> +		fs_encoding = sys.getfilesystemencoding()
> +		argv = [x.encode(fs_encoding, 'surrogateescape') for x in argv]
> +
> +	for x, arg in enumerate(argv):
> +		try:
> +			argv[x] = _unicode_decode(arg,
> errors='strict')
> +		except UnicodeDecodeError:
> +			writemsg('dohtml: argument is not encoded as
> UTF-8: %s\n' %
> +				_unicode_decode(arg), noiselevel=-1)
> +			sys.exit(1)
> +
>  	options = OptionsClass()
>  	args = []
>  
>  	x = 1
> -	while x < len(sys.argv):
> -		arg = sys.argv[x]
> +	while x < len(argv):
> +		arg = argv[x]
>  		if arg in ["-h","-r","-V"]:
>  			if arg == "-h":
>  				print_help()
> @@ -169,17 +192,17 @@ def parse_args():
>  				options.recurse = True
>  			elif arg == "-V":
>  				options.verbose = True
> -		elif sys.argv[x] in ["-A","-a","-f","-x","-p"]:
> +		elif argv[x] in ["-A","-a","-f","-x","-p"]:
>  			x += 1
> -			if x == len(sys.argv):
> +			if x == len(argv):
>  				print_help()
>  				sys.exit(0)
>  			elif arg == "-p":
> -				options.doc_prefix = sys.argv[x]
> +				options.doc_prefix = argv[x]
>  				if options.doc_prefix:
>  					options.doc_prefix = normalize_path(options.doc_prefix)
>  			else:
> -				values = sys.argv[x].split(",")
> +				values = argv[x].split(",")
>  				if arg == "-A":
>  					options.allowed_exts.extend(values)
>  				elif arg == "-a":
> @@ -189,7 +212,7 @@ def parse_args():
>  				elif arg == "-x":
>  					options.disallowed_dirs = values
>  		else:
> -			args.append(sys.argv[x])
> +			args.append(argv[x])
>  		x += 1
>  
>  	return (options, args)

Looks good

-- 
Brian Dolbec <dolsen>



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-10-04 21:13 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-03 22:33 [gentoo-portage-dev] [PATCH] dohtml: handle unicode (bug 561846) Zac Medico
2015-10-04 21:12 ` Brian Dolbec

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.