|
10 | 10 | import locale
|
11 | 11 | import os
|
12 | 12 | import sys
|
13 |
| -import codecs |
14 | 13 |
|
15 | 14 |
|
16 | 15 | from gitdb.utils.compat import (
|
@@ -91,181 +90,3 @@ def __str__(self):
|
91 | 90 | else: # Python 2
|
92 | 91 | def __str__(self):
|
93 | 92 | return self.__unicode__().encode(defenc)
|
94 |
| - |
95 |
| - |
96 |
| -""" |
97 |
| -This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
98 |
| -handler of Python 3. |
99 |
| -Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
100 |
| -""" |
101 |
| - |
102 |
| -# This code is released under the Python license and the BSD 2-clause license |
103 |
| - |
104 |
| - |
105 |
# Name of the codec error handler passed to ``str.encode`` / ``bytes.decode``
# by encodefilename() and decodefilename(), and the name under which
# register_surrogateescape() installs the pure-Python handler.
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
109 |
| - |
110 |
def u(text):
    """Return *text* as a text (unicode) string.

    When ``PY3`` is true the argument is handed back untouched; otherwise it
    is decoded with the ``unicode_escape`` codec.
    """
    return text if PY3 else text.decode('unicode_escape')
114 |
| - |
115 |
def b(data):
    """Return *data* as a byte string.

    When ``PY3`` is true the argument is encoded with ``latin1``; otherwise
    it is handed back untouched.
    """
    return data.encode('latin1') if PY3 else data
119 |
| - |
120 |
def surrogateescape_handler(exc):
    """
    Codec error callback implementing PEP 383 ("surrogateescape") in pure
    Python.

    While decoding, every undecodable byte becomes a U+DCxx character; while
    encoding, those characters are mapped back to the bytes they stand for.

    :param exc: the ``UnicodeDecodeError`` or ``UnicodeEncodeError`` being
        handled; any other exception type is re-raised as-is
    :return: tuple ``(replacement, exc.end)``
    """
    # The offending slice: bytes when decoding, text when encoding.
    fragment = exc.object[exc.start:exc.end]

    decoding = isinstance(exc, UnicodeDecodeError)
    if not decoding and not isinstance(exc, UnicodeEncodeError):
        raise exc

    try:
        if decoding:
            replacement = replace_surrogate_decode(fragment)
        else:
            # In the case of u'\udcc3'.encode('ascii',
            # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise
            # an exception anyway after this function is called. It seems
            # that the strict encoder is called to encode the unicode string
            # that this function returns ...
            replacement = replace_surrogate_encode(fragment, exc)
    except NotASurrogateError:
        # The slice cannot be escaped; report the original codec error.
        raise exc
    return (replacement, exc.end)
145 |
| - |
146 |
| - |
147 |
class NotASurrogateError(Exception):
    """Signals that a value could not be treated as a surrogate escape."""
149 |
| - |
150 |
| - |
151 |
def replace_surrogate_encode(mystring, exc):
    """
    Map escape surrogates back to characters carrying the original byte value.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the slice of the text that could not be encoded
    :param exc: the originating codec error; it is re-raised when a character
        of ``mystring`` is not an escape surrogate
    :return: a string with one character per input character, each with
        ordinal ``code - 0xDC00`` (i.e. in the range 0x00-0xFF)
    :raise: ``exc`` itself if any character lies outside U+DC00..U+DCFF
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF correspond to a byte value. Everything else
        # fails with the original exception. This includes the surrogates
        # U+D800..U+DBFF: for them ``code - 0xDC00`` is negative and chr()
        # would raise an unrelated ValueError instead of the codec error.
        if not 0xDC00 <= code <= 0xDCFF:
            raise exc
        # NOTE(review): U+DC00..U+DC7F (ASCII values) are accepted here as in
        # the original code, although it questioned whether that is a good
        # idea.
        decoded.append(chr(code - 0xDC00))
    return str().join(decoded)
178 |
| - |
179 |
| - |
180 |
def replace_surrogate_decode(mybytes):
    """
    Turn undecodable bytes into their surrogate-escaped text form.

    Values up to 0x7F are kept as the matching character; values 0x80-0xFF
    become the character U+DC00 + value.

    :param mybytes: iterable yielding ints or one-character strings
    :return: a (unicode) string
    :raise NotASurrogateError: if an item's value is above 0xFF
    """
    # NOTE(review): on Python 2 the builtin chr() only accepts 0-255; confirm
    # that ``chr`` is rebound suitably elsewhere in this module.
    pieces = []
    for item in mybytes:
        # Items are ints when iterating "newbytes", or one-character native
        # strings on Py2.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            pieces.append(chr(value))
        elif value <= 0xFF:
            pieces.append(chr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(pieces)
203 |
| - |
204 |
| - |
205 |
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using ``FS_ENCODING``.

    For the 'ascii' and 'utf-8' encodings the work is done by hand so that
    escape surrogates U+DC80..U+DCFF turn back into the single bytes
    0x80..0xFF; every other encoding is delegated to ``fn.encode`` with the
    ``FS_ERRORS`` handler.

    :raise UnicodeEncodeError: for a non-ASCII, non-escape character under
        'ascii', or for any other surrogate under 'utf-8'
    """
    ascii_only = FS_ENCODING == 'ascii'
    if not ascii_only and FS_ENCODING != 'utf-8':
        return fn.encode(FS_ENCODING, FS_ERRORS)

    chunks = []
    for pos, char in enumerate(fn):
        ordinal = ord(char)
        if 0xDC80 <= ordinal <= 0xDCFF:
            # Escape surrogate: emit the original raw byte. Done by hand
            # because the ASCII encoder of Python 2 expects the error handler
            # to return ASCII-encodable text, and its UTF-8 encoder encodes
            # surrogates without ever consulting the error handler.
            chunks.append(bytes((ordinal - 0xDC00,)))
        elif ascii_only:
            if ordinal >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            chunks.append(bytes((ordinal,)))
        elif 0xD800 <= ordinal <= 0xDFFF:
            raise UnicodeEncodeError(
                FS_ENCODING,
                fn, pos, pos + 1, 'surrogates not allowed')
        else:
            chunks.append(char.encode('utf-8'))
    return bytes().join(chunks)
242 |
| - |
243 |
def decodefilename(fn):
    """Decode the byte filename ``fn`` with ``FS_ENCODING`` and the ``FS_ERRORS`` handler."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
245 |
| - |
246 |
# Active configuration: the filesystem encoding together with a sample
# filename in raw (``fn``) and surrogate-escaped (``encoded``) form.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative configurations:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Replace the encoding name with the canonical one reported by the codec
# registry. For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
254 |
| - |
255 |
| - |
256 |
def register_surrogateescape():
    """
    Install ``surrogateescape_handler`` under the name ``FS_ERRORS``.

    Does nothing when ``PY3`` is true, or when an error handler with that
    name can already be looked up.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler of that name yet: provide the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
266 |
| - |
267 |
| - |
268 |
# Probe at import time: try to decode a byte string containing a non-ASCII
# byte (0x9f) with the "surrogateescape" error handler. If that raises for any
# reason, fall back to registering the pure-Python handler.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except Exception:
    register_surrogateescape()
0 commit comments