File: | blib/lib/Encode/Wide.pm |
Coverage: | 84.4% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | package Encode::Wide; | |||||
2 | ||||||
3 | # TODO: don't transform anything within <script>...</script> in wide_to_html | |||||
4 | ||||||
5 | 3 3 3 | 68419 2 44 | use strict; | |||
6 | 3 3 3 | 5 2 54 | use warnings; | |||
7 | ||||||
8 | 3 3 3 | 6 1 29 | use Exporter qw(import); | |||
9 | 3 3 3 | 574 6869 120 | use HTML::Entities; | |||
10 | 3 3 3 | 567 16166 56 | use Params::Get 0.13; | |||
11 | 3 3 3 | 854 11510 3902 | use Term::ANSIColor; | |||
12 | ||||||
13 | our @EXPORT_OK = qw(wide_to_html wide_to_xml); | |||||
14 | ||||||
15 | # Encode to HTML whatever the non-ASCII encoding scheme has been chosen | |||||
16 | # Can't use HTML:Entities::encode since that doesn't seem to cope with | |||||
17 | # all encodings and misses some characters | |||||
18 | # | |||||
19 | # See https://www.compart.com/en/unicode/U+0161 etc. | |||||
20 | # https://www.compart.com/en/unicode/U+00EB | |||||
21 | # | |||||
22 | # keep_hrefs => 1 means ensure hyperlinks still work | |||||
23 | # keep_apos => 1 means keep apostrophes, useful within <script> | |||||
24 | ||||||
25 | =encoding UTF-8 | |||||
26 | ||||||
27 - 35 | =head1 NAME Encode::Wide - Convert wide characters (Unicode, UTF-8, etc.) into HTML or XML-safe ASCII entities =head1 VERSION 0.06 =cut | |||||
36 | ||||||
37 | our $VERSION = 0.06; | |||||
38 | ||||||
39 - 104 | =head1 SYNOPSIS use Encode::Wide qw(wide_to_html wide_to_xml); my $html = wide_to_html(string => "Café déjà vu â naïve façade"); # returns: 'Café déjà vu – naïve façade' my $xml = wide_to_xml(string => "Café déjà vu â naïve façade"); # returns: 'Café déjà vu – naïve façade' =head1 DESCRIPTION Encode::Wide provides functions for converting wide (Unicode) characters into ASCII-safe formats suitable for embedding in HTML or XML documents. It is especially useful when dealing with text containing accented or typographic characters that need to be safely represented in markup. Other modules exist to do this, however they tend to have assumptions on the input, whereas this should work with UTF-8, Unicode, or anything that's common. The module offers two exportable functions: =over 4 =item * C<wide_to_html(string => $text)> Converts all non-ASCII characters in the input string to their named HTML entities if available, or hexadecimal numeric entities otherwise. Common characters such as `é`, `à `, `&`, `<`, `>` are converted to their standard HTML representations like `é`, `à`, `&`, etc. =item * C<wide_to_xml(string => $text)> Converts all non-ASCII characters in the input string to hexadecimal numeric entities. Unlike HTML, XML does not support many named entities, so this function ensures compliance by using numeric representations such as `é` for `é`. =back =head1 PARAMETERS Both functions accept a named parameter: =over 4 =item * C<string> â The Unicode string to convert. =back =head1 ENCODING Input strings are expected to be valid UTF-8 or Unicode. If a byte string is passed, the module will attempt to decode it appropriately. Output is guaranteed to be pure ASCII. =head1 EXPORT None by default. Optionally exportable: wide_to_html wide_to_xml =cut | |||||
105 | ||||||
106 | sub wide_to_html | |||||
107 | { | |||||
108 | 25 | 280542 | my $params = Params::Get::get_params('string', @_); | |||
109 | ||||||
110 | 24 | 324 | my $string = $params->{'string'}; | |||
111 | 24 | 23 | my $complain = $params->{'complain'}; | |||
112 | ||||||
113 | 24 | 32 | if(!defined($string)) { | |||
114 | 0 | 0 | my $i = 0; | |||
115 | 0 | 0 | while((my @call_details = caller($i++))) { | |||
116 | 0 | 0 | print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||
117 | } | |||||
118 | 0 | 0 | die 'BUG: wide_to_html() string not set'; | |||
119 | } | |||||
120 | ||||||
121 | 24 | 33 | if(ref($string) eq 'SCALAR') { | |||
122 | 9 9 | 6 13 | $string = ${$string}; | |||
123 | } | |||||
124 | ||||||
125 | # print STDERR __LINE__, ": ($string)"; | |||||
126 | # print STDERR (sprintf '%v02X', $string); | |||||
127 | # print STDERR "\n"; | |||||
128 | # my $i = 0; | |||||
129 | # while((my @call_details = caller($i++))) { | |||||
130 | # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||||
131 | # } | |||||
132 | ||||||
133 | 24 | 57 | $string = HTML::Entities::decode($string); | |||
134 | # $string =~ s/ & / & /g; | |||||
135 | ||||||
136 | # I don't think HTML::Entities does these | |||||
137 | 24 | 44 | my %entity_map = ( | |||
138 | 'č' => 'Ä', | |||||
139 | 'ž' => 'ž', | |||||
140 | 'Š' => 'Å ', | |||||
141 | ); | |||||
142 | ||||||
143 | 24 | 67 | $string =~ s{ | |||
144 | # ([\x80-\x{10FFFF}]) | |||||
145 | (.) | |||||
146 | }{ | |||||
147 | 514 | 459 | my $cp = $1; | |||
148 | exists $entity_map{$cp} | |||||
149 | 514 | 744 | ? $entity_map{$cp} | |||
150 | : $cp | |||||
151 | }gex; | |||||
152 | ||||||
153 | # Escape only if it's not already part of an entity | |||||
154 | 24 | 52 | $string =~ s/&(?![A-Za-z#0-9]+;)/&/g; | |||
155 | ||||||
156 | 24 | 32 | unless($params->{'keep_hrefs'}) { | |||
157 | 22 | 64 | %entity_map = ( | |||
158 | '<' => '<', | |||||
159 | '>' => '>', | |||||
160 | '"' => '"', | |||||
161 | ); | |||||
162 | 22 | 31 | $string =~ s{(.)}{ | |||
163 | 458 | 348 | my $cp = $1; | |||
164 | exists $entity_map{$cp} | |||||
165 | 458 | 546 | ? $entity_map{$cp} | |||
166 | : $cp | |||||
167 | }gex; | |||||
168 | } | |||||
169 | ||||||
170 | # $string =~ s/&db=/&db=/g; | |||||
171 | # $string =~ s/&id=/&id=/g; | |||||
172 | ||||||
173 | # Table of byte-sequences->entities | |||||
174 | 24 | 82 | my @byte_map = ( | |||
175 | ['â', '"'], # U+201C | |||||
176 | ['â', '"'], # U+201D | |||||
177 | ["\xe2\x80\x9c", '"'], # â | |||||
178 | ["\xe2\x80\x9d", '"'], # â | |||||
179 | ["\xe2\x80\x93", '–'], | |||||
180 | ["\xe2\x80\x94", '—'], | |||||
181 | ["\xe2\x80\x98", '''], # â | |||||
182 | ["\xe2\x80\x99", '''], # â | |||||
183 | ["\xe2\x80\xA6", '...'], # ⦠| |||||
184 | ['!', '!'], # Do this early before the ascii check, since it's an ascii character | |||||
185 | ); | |||||
186 | ||||||
187 | 24 | 30 | $string = _sub_map(\$string, \@byte_map); | |||
188 | ||||||
189 | 24 | 38 | unless($params->{'keep_apos'}) { | |||
190 | # We can't combine since each char in the multi-byte matches, not the entire multi-byte | |||||
191 | # $string =~ s/['âââ\x98]/'/g; | |||||
192 | 22 | 45 | %entity_map = ( | |||
193 | "'" => ''', | |||||
194 | 'â' => ''', | |||||
195 | 'â' => ''', | |||||
196 | 'â' => ''', | |||||
197 | "\x98" => ''', | |||||
198 | ); | |||||
199 | ||||||
200 | 22 | 28 | $string =~ s{ | |||
201 | # ([\x80-\x{10FFFF}]) | |||||
202 | (.) | |||||
203 | }{ | |||||
204 | 571 | 316 | my $cp = $1; | |||
205 | exists $entity_map{$cp} | |||||
206 | 571 | 513 | ? $entity_map{$cp} | |||
207 | : $cp | |||||
208 | }gex; | |||||
209 | } | |||||
210 | ||||||
211 | 24 | 43 | if($string !~ /[^[:ascii:]]/) { | |||
212 | 18 | 59 | return $string; | |||
213 | } | |||||
214 | ||||||
215 | @byte_map = ( | |||||
216 | 6 | 105 | ["\xc2\xa0", ' '], # Non breaking space | |||
217 | ["\xc2\xa3", '£'], | |||||
218 | ["\xc2\xa9", '©'], | |||||
219 | ["\xc2\xae", '®'], | |||||
220 | ["\xc3\xa2", 'â'], | |||||
221 | ["\xc3\xa4", 'ä'], | |||||
222 | ["\xc3\xa9", 'é'], | |||||
223 | ["\xc2\xaa", 'ª'], # ª | |||||
224 | ["\xc2\xab", '"'], # « | |||||
225 | ["\xc2\xbb", '"'], # » | |||||
226 | ["\xc3\x81", 'Á'], # Ã | |||||
227 | ["\xc3\x83", 'Î'], # Ã | |||||
228 | ["\xc3\x9e", 'Þ'], # Ã | |||||
229 | ["\xc3\xa0", 'à'], # Ã | |||||
230 | ["\xc3\xa1", 'á'], # á | |||||
231 | ["\xc3\xad", 'í'], # Ã | |||||
232 | ["\xc3\xb0", 'ð'], # ð | |||||
233 | ["\xc3\xba", 'ú'], # ú | |||||
234 | ["\xc3\xb4", 'ô'], # ô | |||||
235 | ["\xc3\xb6", 'ö'], | |||||
236 | ["\xc3\xb8", 'ø'], # ø | |||||
237 | ["\xc5\xa1", 'š'], | |||||
238 | ["\xc4\x8d", 'č'], | |||||
239 | ["\xc5\xbe", 'ž'], | |||||
240 | ["\xc3\xa5", 'å'], # Ã¥ | |||||
241 | ["\xc3\xa7", 'ç'], | |||||
242 | ["\xc3\xaf", 'ï'], # ï | |||||
243 | ["\xc3\xb3", 'ó'], | |||||
244 | ["\xc3\x96", 'Ö'], # Ã | |||||
245 | ["\xc3\xa8", 'è'], | |||||
246 | ["\xc3\x89", 'É'], | |||||
247 | ["\xc3\x9f", 'ß'], | |||||
248 | ["\xc3\xaa", 'ê'], | |||||
249 | ["\xc3\xab", 'ë'], | |||||
250 | ["\xc3\xae", 'î'], | |||||
251 | ["\xc3\xbb", 'û'], | |||||
252 | ["\xc3\xbc", 'ü'], # ü | |||||
253 | ["\xc3\xbe", 'þ'], # þ | |||||
254 | ["\xc5\x9b", 'ś'], | |||||
255 | ["\xc5\xa0", 'Š'], | |||||
256 | ["\xe2\x80\x93", '–'], | |||||
257 | ["\xe2\x80\x94", '—'], | |||||
258 | ["\xc3\xb1", 'ñ'], # ñ | |||||
259 | ["\xe2\x80\x9c", '"'], | |||||
260 | ["\xe2\x80\x9d", '"'], | |||||
261 | ["\xe2\x80\xa6", '...'], | |||||
262 | ["\xe2\x97\x8f", '●'], # â | |||||
263 | ["\N{U+00A0}", ' '], | |||||
264 | ["\N{U+00A3}", '£'], | |||||
265 | ["\N{U+00A9}", '©'], | |||||
266 | ["\N{U+00AA}", 'ª'], # ª | |||||
267 | ["\N{U+00AB}", '"'], # « | |||||
268 | ["\N{U+00AE}", '®'], | |||||
269 | ["\N{U+00B5}", 'µ'], # µ | |||||
270 | ["\N{U+00BB}", '"'], # » | |||||
271 | ["\N{U+00CE}", 'Î'], # Ã | |||||
272 | ["\N{U+00DE}", 'Þ'], # Ã | |||||
273 | ["\N{U+0161}", 'š'], | |||||
274 | ["\N{U+010D}", 'č'], | |||||
275 | ["\N{U+017E}", 'ž'], | |||||
276 | ["\N{U+00C9}", 'É'], | |||||
277 | ["\N{U+00D6}", 'Ö'], # Ã | |||||
278 | ["\N{U+00DF}", 'ß'], # Ã | |||||
279 | ["\N{U+00E1}", 'á'], # á | |||||
280 | ["\N{U+00E2}", 'â'], | |||||
281 | ["\N{U+00E4}", 'ä'], | |||||
282 | ["\N{U+00E5}", 'å'], # Ã¥ | |||||
283 | ["\N{U+00E7}", 'ç'], # ç | |||||
284 | ["\N{U+00E8}", 'è'], | |||||
285 | ["\N{U+00E9}", 'é'], | |||||
286 | ["\N{U+00ED}", 'í'], # Ã | |||||
287 | ["\N{U+00EE}", 'î'], | |||||
288 | ["\N{U+00EF}", 'ï'], # ï | |||||
289 | ["\N{U+00F0}", 'ð'], # ð | |||||
290 | ["\N{U+00F1}", 'ñ'], # ñ | |||||
291 | ["\N{U+00F4}", 'ô'], # ô | |||||
292 | ["\N{U+00F6}", 'ö'], | |||||
293 | ["\N{U+00F8}", 'ø'], # ø | |||||
294 | ["\N{U+00FA}", 'ú'], # ú | |||||
295 | ["\N{U+00FC}", 'ü'], # ü | |||||
296 | ["\N{U+00FE}", 'þ'], # þ | |||||
297 | ["\N{U+00C1}", 'Á'], # Ã | |||||
298 | ["\N{U+00C9}", 'É'], | |||||
299 | ["\N{U+00CA}", 'ê'], | |||||
300 | ["\N{U+00EB}", 'ë'], | |||||
301 | ["\N{U+00F3}", 'ó'], | |||||
302 | ["\N{U+015B}", 'ś'], | |||||
303 | ["\N{U+00FB}", 'û'], | |||||
304 | ["\N{U+0160}", 'Š'], | |||||
305 | ["\N{U+2013}", '–'], | |||||
306 | ["\N{U+2014}", '—'], | |||||
307 | ["\N{U+2018}", '"'], | |||||
308 | ["\N{U+2019}", '"'], | |||||
309 | ["\N{U+201C}", '"'], | |||||
310 | ["\N{U+201D}", '"'], | |||||
311 | ["\N{U+2026}", '...'], # ⦠| |||||
312 | ["\N{U+25CF}", '●'], # â | |||||
313 | ); | |||||
314 | ||||||
315 | 6 | 7 | $string = _sub_map(\$string, \@byte_map); | |||
316 | ||||||
317 | # utf8::encode($string); | |||||
318 | # $string =~ s/Å¡/š/g; | |||||
319 | # $string =~ s/Ä/č/g; | |||||
320 | # $string =~ s/ž/ž/g; | |||||
321 | # $string =~ s/é/é/g; | |||||
322 | # $string =~ s/ç/ç/g; | |||||
323 | # $string =~ s/\N{U+0161}/š/g; | |||||
324 | # $string =~ s/\N{U+010D}/č/g; | |||||
325 | # $string =~ s/\N{U+017E}/ž/g; | |||||
326 | # $string =~ s/\N{U+00E9}/é/g; | |||||
327 | # $string =~ s/\N{U+00D6}/Ö/g; # Ã | |||||
328 | # $string =~ s/\N{U+00E7}/ç/g; # ç | |||||
329 | # $string =~ s/\N{U+00E8}/è/g; | |||||
330 | # $string =~ s/\N{U+00E9}/É/g; | |||||
331 | ||||||
332 | # print STDERR __LINE__, ": ($string)"; | |||||
333 | # print STDERR (sprintf '%v02X', $string); | |||||
334 | # print STDERR "\n"; | |||||
335 | ||||||
336 | # utf8::decode($string); | |||||
337 | ||||||
338 | # print STDERR __LINE__, ": ($string)"; | |||||
339 | # # print STDERR (sprintf '%v02X', $string); | |||||
340 | # print STDERR "\n"; | |||||
341 | ||||||
342 | # $string =~ s/\xe4\x8d/č/g; # ? ACOM strangeness | |||||
343 | # $string =~ s/\N{U+0161}/š/g; | |||||
344 | # $string =~ s/\N{U+010D}/č/g; | |||||
345 | # $string =~ s/\N{U+00A9}/©/g; | |||||
346 | # $string =~ s/\N{U+00AE}/®/g; | |||||
347 | # $string =~ s/\N{U+00E2}/â/g; | |||||
348 | # $string =~ s/\N{U+00E4}/ä/g; | |||||
349 | # $string =~ s/\N{U+00E8}/è/g; | |||||
350 | # $string =~ s/\N{U+00E9}/é/g; | |||||
351 | # $string =~ s/\N{U+00EB}/ë/g; | |||||
352 | # $string =~ s/\N{U+00F3}/ó/g; | |||||
353 | # $string =~ s/\N{U+00FB}/û/g; | |||||
354 | # $string =~ s/\N{U+017E}/ž/g; | |||||
355 | # $string =~ s/\N{U+00D6}/Ö/g; # Ã | |||||
356 | # $string =~ s/\N{U+00E7}/ç/g; # ç | |||||
357 | # $string =~ s/\N{U+00C9}/É/g; | |||||
358 | # $string =~ s/\N{U+00CA}/ê/g; | |||||
359 | # $string =~ s/\N{U+0160}/Š/g; # FIXME: also above | |||||
360 | # $string =~ s/\N{U+2013}/-/g; | |||||
361 | ||||||
362 | 6 | 82 | @byte_map = ( | |||
363 | [ 'Ã', 'Á' ], | |||||
364 | [ 'Ã¥', 'å' ], | |||||
365 | [ 'ª', 'ª' ], | |||||
366 | [ 'Å¡', 'š' ], | |||||
367 | [ 'Å ', 'Š' ], | |||||
368 | [ 'Ä', 'č' ], | |||||
369 | [ 'ž', 'ž' ], | |||||
370 | [ 'á', 'á' ], | |||||
371 | [ 'â', 'â' ], | |||||
372 | [ 'é', 'é' ], | |||||
373 | [ 'è', 'è' ], | |||||
374 | [ 'ç', 'ç' ], | |||||
375 | [ 'ê', 'ê' ], | |||||
376 | [ 'ë', 'ë' ], | |||||
377 | [ 'ð', 'ð' ], | |||||
378 | [ 'Ã', 'í' ], | |||||
379 | [ 'ï', 'ï' ], | |||||
380 | [ 'Ã', '&Iicrc;' ], | |||||
381 | [ '©', '©' ], | |||||
382 | [ '®', '®' ], | |||||
383 | [ 'ó', 'ó' ], | |||||
384 | [ 'ô', 'ô' ], | |||||
385 | [ 'ö', 'ö' ], | |||||
386 | [ 'ø', 'ø' ], | |||||
387 | [ 'Å', 'ś' ], | |||||
388 | [ 'Ã', 'Þ' ], | |||||
389 | [ 'þ', 'þ' ], | |||||
390 | [ 'û', 'û' ], | |||||
391 | [ 'ü', 'ü' ], | |||||
392 | [ 'ú', 'ú' ], | |||||
393 | [ 'µ', 'µ'], | |||||
394 | [ '£', '£' ], | |||||
395 | [ 'Ã', 'ß' ], | |||||
396 | [ 'â', '–' ], | |||||
397 | [ 'â', '—' ], | |||||
398 | [ 'ñ', 'ñ' ], | |||||
399 | [ 'â', '"' ], | |||||
400 | [ 'â', '"' ], | |||||
401 | [ '«', '"' ], | |||||
402 | [ '»', '"' ], | |||||
403 | [ 'â¦', '...' ], | |||||
404 | [ 'â', '●' ], | |||||
405 | [ "\x80\$", ' ' ], | |||||
406 | ); | |||||
407 | ||||||
408 | 6 | 7 | $string = _sub_map(\$string, \@byte_map); | |||
409 | ||||||
410 | # if($string =~ /^Maria\(/) { | |||||
411 | # # print STDERR (unpack 'H*', $string); | |||||
412 | # print STDERR __LINE__, ': '; | |||||
413 | # print STDERR (sprintf '%v02X', $string); | |||||
414 | # print STDERR "\n"; | |||||
415 | # my $i = 0; | |||||
416 | # while((my @call_details = caller($i++))) { | |||||
417 | # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||||
418 | # } | |||||
419 | # # die $string; | |||||
420 | # } | |||||
421 | ||||||
422 | # print STDERR __LINE__, ": ($string)\n"; | |||||
423 | 6 | 11 | if($string =~ /[^[:ascii:]]/) { | |||
424 | 1 | 2 | $string = HTML::Entities::encode_entities_numeric($string, '\x80-\x{10FFFF}'); | |||
425 | 1 | 39 | if($string =~ /[^[:ascii:]]/) { | |||
426 | 0 | 0 | print STDERR (unpack 'H*', $string); | |||
427 | 0 | 0 | print STDERR __LINE__, ': '; | |||
428 | 0 | 0 | print STDERR (sprintf '%v02X', $string), "\n"; | |||
429 | 0 | 0 | my $i = 0; | |||
430 | 0 | 0 | while((my @call_details = caller($i++))) { | |||
431 | 0 | 0 | print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||
432 | } | |||||
433 | 0 | 0 | $complain->("TODO: wide_to_html($string)") if($complain); | |||
434 | 0 | 0 | warn "TODO: wide_to_html($string)"; | |||
435 | # $string =~ s/[^[:ascii:]]/XXXXX/g; | |||||
436 | 0 | 0 | $string =~ s{ | |||
437 | ([^[:ascii:]]) | |||||
438 | }{ | |||||
439 | 0 | 0 | '>>>>' . sprintf("%04X", ord($1)) . '<<<<' | |||
440 | }gex; # e=evaluate, g=global, x=extended | |||||
441 | 0 | 0 | die "BUG: wide_to_html($string)"; | |||
442 | } | |||||
443 | } | |||||
444 | ||||||
445 | 6 | 26 | return $string; | |||
446 | } | |||||
447 | ||||||
448 | # See https://www.compart.com/en/unicode/U+0161 etc. | |||||
449 | # https://www.compart.com/en/unicode/U+00EB | |||||
450 | sub wide_to_xml | |||||
451 | { | |||||
452 | 25 | 10643 | my $params = Params::Get::get_params('string', @_); | |||
453 | ||||||
454 | 24 | 324 | my $string = $params->{'string'}; | |||
455 | 24 | 22 | my $complain = $params->{'complain'}; | |||
456 | ||||||
457 | 24 | 32 | if(!defined($string)) { | |||
458 | 0 | 0 | my $i = 0; | |||
459 | 0 | 0 | while((my @call_details = caller($i++))) { | |||
460 | 0 | 0 | print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||
461 | } | |||||
462 | 0 | 0 | die 'BUG: string not set'; | |||
463 | } | |||||
464 | ||||||
465 | 24 | 33 | if(ref($string) eq 'SCALAR') { | |||
466 | 9 9 | 7 11 | $string = ${$string}; | |||
467 | } | |||||
468 | ||||||
469 | # print STDERR __LINE__, ": ($string)"; | |||||
470 | # print STDERR (sprintf '%v02X', $string); | |||||
471 | # print STDERR "\n"; | |||||
472 | ||||||
473 | # my $i = 0; | |||||
474 | # while((my @call_details = caller($i++))) { | |||||
475 | # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||||
476 | # } | |||||
477 | ||||||
478 | 24 | 62 | $string = HTML::Entities::decode($string); | |||
479 | # print STDERR __LINE__, ": ($string)\n"; | |||||
480 | ||||||
481 | # $string =~ s/&/&/g; | |||||
482 | ||||||
483 | # I don't think HTML::Entities does these | |||||
484 | 24 | 39 | my %entity_map = ( | |||
485 | 'č' => 'Ä', | |||||
486 | 'ž' => 'ž', | |||||
487 | 'Š' => 'Å ', | |||||
488 | ); | |||||
489 | ||||||
490 | 24 | 64 | $string =~ s{ | |||
491 | # ([\x80-\x{10FFFF}]) | |||||
492 | (.) | |||||
493 | }{ | |||||
494 | 514 | 434 | my $cp = $1; | |||
495 | exists $entity_map{$cp} | |||||
496 | 514 | 748 | ? $entity_map{$cp} | |||
497 | : $cp | |||||
498 | }gex; | |||||
499 | ||||||
500 | # Escape only if it's not already part of an entity | |||||
501 | 24 | 52 | $string =~ s/&(?![A-Za-z#0-9]+;)/&/g; | |||
502 | ||||||
503 | 24 | 34 | unless($params->{'keep_hrefs'}) { | |||
504 | 22 | 57 | %entity_map = ( | |||
505 | '<' => '<', | |||||
506 | '>' => '>', | |||||
507 | '"' => '"', | |||||
508 | 'â' => '"', # U+201C | |||||
509 | 'â' => '"', # U+201D | |||||
510 | ); | |||||
511 | ||||||
512 | 22 | 30 | $string =~ s{(.)}{ | |||
513 | 458 | 322 | my $cp = $1; | |||
514 | exists $entity_map{$cp} | |||||
515 | 458 | 516 | ? $entity_map{$cp} | |||
516 | : $cp | |||||
517 | }gex; | |||||
518 | } | |||||
519 | ||||||
520 | # $string =~ s/â/'/g; | |||||
521 | # $string =~ s/â/'/g; | |||||
522 | # $string =~ s/â/'/g; | |||||
523 | # $string =~ s/â/'/g; | |||||
524 | # $string =~ s/\x98/'/g; | |||||
525 | # $string =~ s/['âââ\x98]/'/g; | |||||
526 | ||||||
527 | # Table of byte-sequences->entities | |||||
528 | 24 | 93 | my @byte_map = ( | |||
529 | [ "\xe2\x80\x9c", '"' ], # â | |||||
530 | [ "\xe2\x80\x9d", '"' ], # â | |||||
531 | [ 'â', '"' ], # U+201C | |||||
532 | [ 'â', '"' ], # U+201D | |||||
533 | [ "\xe2\x80\x93", '-' ], # ndash | |||||
534 | [ "\xe2\x80\x94", '-' ], # mdash | |||||
535 | [ "\xe2\x80\x98", ''' ], # â | |||||
536 | [ "\xe2\x80\x99", ''' ], # â | |||||
537 | [ "\xe2\x80\xA6", '...' ], # ⦠| |||||
538 | [ "'", ''' ], | |||||
539 | [ 'â', ''' ], | |||||
540 | [ 'â', ''' ], | |||||
541 | [ 'â', ''' ], | |||||
542 | [ "\x98", ''' ], | |||||
543 | ); | |||||
544 | ||||||
545 | 24 | 31 | $string = _sub_map(\$string, \@byte_map); | |||
546 | ||||||
547 | 24 | 183 | %entity_map = ( | |||
548 | '©' => '©', | |||||
549 | 'Á' => 'Á', # Ã | |||||
550 | 'č' => 'č', | |||||
551 | 'à' => 'à', # á | |||||
552 | 'á' => 'á', # á | |||||
553 | 'â' => 'â', # â | |||||
554 | 'ä' => 'ä', # ä | |||||
555 | 'å' => 'å', # Ã¥ | |||||
556 | 'ç' => 'ç', # ç | |||||
557 | 'è' => 'è', | |||||
558 | 'é' => 'é', | |||||
559 | 'ê' => 'ê', | |||||
560 | 'ë' => 'ë', # euml | |||||
561 | 'Î' => 'Î', # Ã | |||||
562 | 'É' => 'É', | |||||
563 | 'ß' => 'ß', # Ã | |||||
564 | 'í' => 'í', # Ã | |||||
565 | 'î' => 'î', | |||||
566 | 'ï' => 'ï', # ï | |||||
567 | 'ð' => 'ð', # ð | |||||
568 | 'ú' => '�FA;', # ú | |||||
569 | 'ü' => 'ü', | |||||
570 | 'š' => 'š', | |||||
571 | 'ó' => 'ó', # ó | |||||
572 | 'û' => 'ô', | |||||
573 | 'ö' => 'ö', | |||||
574 | 'ª' => 'ª', # ª | |||||
575 | 'ø' => 'ø', # ø | |||||
576 | 'ž' => 'ž', | |||||
577 | 'Š' => 'Š', | |||||
578 | 'Þ' => 'Þ', # Ã | |||||
579 | 'þ' => 'þ', # þ | |||||
580 | '®' => '®', | |||||
581 | '£' => '£', | |||||
582 | 'ñ' => 'ñ', | |||||
583 | '—' => '-', | |||||
584 | '–' => '-', | |||||
585 | '!' => '!', | |||||
586 | ); | |||||
587 | ||||||
588 | 24 | 42 | $string =~ s{(.)}{ | |||
589 | 611 | 340 | my $cp = $1; | |||
590 | exists $entity_map{$cp} | |||||
591 | 611 | 540 | ? $entity_map{$cp} | |||
592 | : $cp | |||||
593 | }gex; | |||||
594 | ||||||
595 | 24 | 45 | if($string !~ /[^[:ascii:]]/) { | |||
596 | 18 | 89 | return $string; | |||
597 | } | |||||
598 | ||||||
599 | # print STDERR __LINE__, ": ($string)"; | |||||
600 | # print STDERR (sprintf '%v02X', $string); | |||||
601 | # print STDERR "\n"; | |||||
602 | ||||||
603 | @byte_map = ( | |||||
604 | 6 | 80 | ["\xc2\xa0", ' '], # Non breaking space | |||
605 | ["\xc2\xa3", '£'], # £ | |||||
606 | ["\xc2\xa9", '©'], | |||||
607 | ["\xc2\xaa", 'ª'], # ª | |||||
608 | ["\xc2\xab", '"'], # « | |||||
609 | ["\xc2\xae", '®'], | |||||
610 | ["\xc3\x81", 'Á'], # Ã | |||||
611 | ["\xc3\x8e", 'Î'], # Ã | |||||
612 | ["\xc3\xa0", 'à'], # Ã | |||||
613 | ["\xc3\xa1", 'á'], # á | |||||
614 | ["\xc3\xa5", 'å'], # Ã¥ | |||||
615 | ["\xc3\xa9", 'é'], | |||||
616 | ["\xc3\xaf", 'ï'], # ï | |||||
617 | ["\xc3\xb1", 'ñ'], # ntilde ñ | |||||
618 | ["\xc5\xa1", 'š'], | |||||
619 | ["\xc4\x8d", 'č'], | |||||
620 | ["\xc5\xbe", 'ž'], # ž | |||||
621 | ["\xc3\x96", 'Ö'], # Ã | |||||
622 | ["\xc3\x9e", 'Þ'], # Ã | |||||
623 | ["\xc3\x9f", 'ß'], # Ã | |||||
624 | ["\xc3\xa2", 'â'], # â | |||||
625 | ["\xc3\xad", 'í'], # Ã | |||||
626 | ["\xc3\xa4", 'ä'], # ä | |||||
627 | ["\xc3\xa7", 'ç'], # ç | |||||
628 | ["\xc3\xb0", 'ð'], # ð | |||||
629 | ["\xc3\xb3", 'ó'], # ó | |||||
630 | ["\xc3\xb8", 'ø'], # ø | |||||
631 | ["\xc3\xbc", 'ü'], # ü | |||||
632 | ["\xc3\xbe", 'þ'], # þ | |||||
633 | ["\xc3\xa8", 'è'], # è | |||||
634 | ["\xc3\xee", 'î'], | |||||
635 | ["\xc3\xb4", 'ô'], # ô | |||||
636 | ["\xc3\xb6", 'ö'], # ö | |||||
637 | ["\xc3\x89", 'É'], | |||||
638 | ["\xc3\xaa", 'ê'], | |||||
639 | ["\xc3\xab", 'ë'], # eumlaut | |||||
640 | ["\xc3\xba", 'ú'], # ú | |||||
641 | ["\xc3\xbb", '»'], # û - ucirc | |||||
642 | ["\xc5\x9b", 'ś'], # Å - sacute | |||||
643 | ["\xc5\xa0", 'Š'], | |||||
644 | ["\xe2\x80\x93", '-'], | |||||
645 | ["\xe2\x80\x94", '-'], | |||||
646 | ["\xe2\x80\x9c", '"'], | |||||
647 | ["\xe2\x80\x9d", '"'], | |||||
648 | ["\xe2\x80\xa6", '...'], | |||||
649 | ["\xe2\x97\x8f", '●'], # â | |||||
650 | ["\xe3\xb1", 'ñ'], # ntilde ñ - what's this one? | |||||
651 | ||||||
652 | # $string =~ s/\xe4\x8d/č/g; # ? ACOM strangeness | |||||
653 | # $string =~ s/\N{U+0161}/š/g; | |||||
654 | # $string =~ s/\N{U+010D}/č/g; | |||||
655 | # $string =~ s/\N{U+00E9}/é/g; | |||||
656 | # $string =~ s/\N{U+017E}/ž/g; | |||||
657 | ||||||
658 | ["\N{U+00A0}", ' '], | |||||
659 | ["\N{U+010D}", 'č'], | |||||
660 | ["\N{U+00AB}", '"'], # « | |||||
661 | ["\N{U+00AE}", '®'], # ® | |||||
662 | ["\N{U+00B5}", 'µ'], # µ | |||||
663 | ["\N{U+00C1}", 'Á'], # Ã | |||||
664 | ["\N{U+00CE}", 'Î'], # Ã | |||||
665 | ["\N{U+00DE}", 'Þ'], # Ã | |||||
666 | ["\N{U+00E4}", 'ä'], # ä | |||||
667 | ["\N{U+00E5}", 'å'], # Ã¥ | |||||
668 | ["\N{U+00EA}", 'ê'], | |||||
669 | ["\N{U+00ED}", 'í'], | |||||
670 | ["\N{U+00EE}", 'î'], | |||||
671 | ["\N{U+00FE}", 'þ'], # þ | |||||
672 | ["\N{U+00C9}", 'É'], | |||||
673 | ["\N{U+017E}", 'ž'], # ž | |||||
674 | ["\N{U+00D6}", 'Ö'], # Ã | |||||
675 | ["\N{U+00DF}", 'ß'], # Ã | |||||
676 | ["\N{U+00E1}", 'á'], # á - aacute | |||||
677 | ["\N{U+00E2}", 'â'], | |||||
678 | ["\N{U+00E8}", 'è'], # è | |||||
679 | ["\N{U+00EF}", 'ï'], # ï | |||||
680 | ["\N{U+00F0}", 'ð'], # ð | |||||
681 | ["\N{U+00F1}", 'ñ'], # ñ | |||||
682 | ["\N{U+00F3}", 'ó'], # ó | |||||
683 | ["\N{U+00F4}", 'ô'], # ô | |||||
684 | ["\N{U+00F6}", 'ö'], # ö | |||||
685 | ["\N{U+00F8}", 'ø'], # ø | |||||
686 | ["\N{U+00FA}", 'ú'], # ú | |||||
687 | ["\N{U+00FC}", 'ü'], # ü | |||||
688 | ["\N{U+015B}", 'ś'], # Å | |||||
689 | # print STDERR __LINE__, ": ($string)"; | |||||
690 | # print STDERR (sprintf '%v02X', $string); | |||||
691 | # print STDERR "\n"; | |||||
692 | ["\N{U+00E9}", 'é'], | |||||
693 | # print STDERR __LINE__, ": ($string)"; | |||||
694 | # print STDERR (sprintf '%v02X', $string); | |||||
695 | # print STDERR "\n"; | |||||
696 | ["\N{U+00E7}", 'ç'], # ç | |||||
697 | ["\N{U+00EB}", 'ë'], # ë | |||||
698 | ["\N{U+00FB}", 'û'], # û | |||||
699 | ["\N{U+0160}", 'Š'], | |||||
700 | ["\N{U+0161}", 'š'], | |||||
701 | ["\N{U+00A9}", '©'], # © | |||||
702 | # print STDERR __LINE__, ": ($string)"; | |||||
703 | # print STDERR (sprintf '%v02X', $string); | |||||
704 | # print STDERR "\n"; | |||||
705 | ["\N{U+2013}", '-'], | |||||
706 | ["\N{U+2014}", '-'], | |||||
707 | ["\N{U+2018}", '"'], | |||||
708 | ["\N{U+2019}", '"'], | |||||
709 | ["\N{U+201C}", '"'], | |||||
710 | ["\N{U+201D}", '"'], | |||||
711 | ["\N{U+2026}", '...'], # ⦠| |||||
712 | ["\N{U+25CF}", '●'], # â | |||||
713 | ); | |||||
714 | ||||||
715 | 6 | 6 | $string = _sub_map(\$string, \@byte_map); | |||
716 | ||||||
717 | # utf8::encode($string); | |||||
718 | # $string =~ s/Å¡/&s#x161;/g; | |||||
719 | # $string =~ s/Ä/č/g; | |||||
720 | # $string =~ s/ž/&z#x17E;/g; | |||||
721 | # $string =~ s/é/é/g; | |||||
722 | # $string =~ s/Ã/Ö/g; | |||||
723 | # $string =~ s/ç/ç/g; | |||||
724 | # $string =~ s/\N{U+0161}/š/g; | |||||
725 | # $string =~ s/\N{U+010D}/č/g; | |||||
726 | # $string =~ s/\N{U+017E}/ž/g; | |||||
727 | # $string =~ s/\N{U+00E9}/é/g; | |||||
728 | # $string =~ s/\N{U+00D6}/Ö/g; # Ã | |||||
729 | # $string =~ s/\N{U+00E7}/ç/g; # ç | |||||
730 | ||||||
731 | # print STDERR __LINE__, ": ($string)"; | |||||
732 | # print STDERR (sprintf '%v02X', $string); | |||||
733 | # print STDERR "\n"; | |||||
734 | ||||||
735 | # utf8::decode($string); | |||||
736 | ||||||
737 | # $string =~ s/['\x98]/'/g; | |||||
738 | 6 | 68 | @byte_map = ( | |||
739 | ["'", '''], | |||||
740 | ["\x98", '''], | |||||
741 | ['©', '©'], | |||||
742 | ['ª', 'ª'], | |||||
743 | ['®', '®'], | |||||
744 | ['Ã¥', 'å'], | |||||
745 | ['Å¡', 'š'], | |||||
746 | ['Ä', 'č'], | |||||
747 | ['ž', 'ž'], | |||||
748 | ['£', '£'], | |||||
749 | ['µ', 'µ'], | |||||
750 | ['á', 'á'], # á | |||||
751 | ['â', 'â'], | |||||
752 | ['ä', 'ä'], # ä | |||||
753 | ['Ã', 'Á'], # Ã | |||||
754 | ['Ã', 'Ö'], | |||||
755 | ['Ã', 'ß'], | |||||
756 | ['ç', 'ç'], | |||||
757 | ['è', 'è'], | |||||
758 | ['é', 'é'], | |||||
759 | ['ê', 'ê'], | |||||
760 | ['ë', 'ë'], | |||||
761 | ['Ã', 'í'], | |||||
762 | ['ï', 'ï'], | |||||
763 | ['Ã', 'Î'], # Ã | |||||
764 | ['Ã', 'Þ'], # Ã | |||||
765 | ['ð', 'ð'], # ð | |||||
766 | ['ø', 'ø'], # ø | |||||
767 | ['û', 'û'], | |||||
768 | ['ñ', 'ñ'], | |||||
769 | ['ú', 'ú'], | |||||
770 | ['ü', 'ü'], | |||||
771 | ['þ', 'þ'], # þ | |||||
772 | ['â', '"'], | |||||
773 | ['â', '"'], | |||||
774 | ['«', '"'], | |||||
775 | ['»', '"'], | |||||
776 | ['â', '-'], | |||||
777 | ['â', '-'], | |||||
778 | ['â¦', '...'], | |||||
779 | ['â', '●'], | |||||
780 | ["\x80\$", ' '], | |||||
781 | ); | |||||
782 | ||||||
783 | 6 | 7 | $string = _sub_map(\$string, \@byte_map); | |||
784 | ||||||
785 | # if($string =~ /^Maria\(/) { | |||||
786 | # print STDERR (unpack 'H*', $string); | |||||
787 | # print STDERR __LINE__, ': '; | |||||
788 | # print STDERR (sprintf '%v02X', $string); | |||||
789 | # print STDERR "\n"; | |||||
790 | # my $i = 0; | |||||
791 | # while((my @call_details = caller($i++))) { | |||||
792 | # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||||
793 | # } | |||||
794 | # die $string; | |||||
795 | # } | |||||
796 | ||||||
797 | # print STDERR __LINE__, ": ($string)\n"; | |||||
798 | 6 | 11 | if($string =~ /[^[:ascii:]]/) { | |||
799 | 1 | 10 | print STDERR (unpack 'H*', $string); | |||
800 | 1 | 3 | print STDERR __LINE__, ': '; | |||
801 | 1 | 3 | print STDERR (sprintf '%v02X', $string); | |||
802 | 1 | 2 | print STDERR "\n"; | |||
803 | 1 | 1 | my $i = 0; | |||
804 | 1 | 4 | while((my @call_details = caller($i++))) { | |||
805 | 2 | 48 | print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n"; | |||
806 | } | |||||
807 | 1 | 30 | $complain->("TODO: wide_to_xml($string)") if($complain); | |||
808 | 1 | 6 | warn "TODO: wide_to_xml($string)"; | |||
809 | # $string =~ s/[^[:ascii:]]/XXXXX/g; | |||||
810 | 1 | 231 | $string =~ s{ | |||
811 | ([^[:ascii:]]) | |||||
812 | }{ | |||||
813 | 2 | 3 | '>>>>' . sprintf("%04X", ord($1)) . '<<<<' | |||
814 | }gex; # e=evaluate, g=global, x=extended | |||||
815 | 1 | 4 | die "BUG: wide_to_xml($string)"; | |||
816 | } | |||||
817 | 5 | 33 | return $string; | |||
818 | } | |||||
819 | ||||||
820 | sub _sub_map | |||||
821 | { | |||||
822 | 72 72 | 42 56 | my $string = ${$_[0]}; | |||
823 | 72 | 53 | my $byte_map = $_[1]; | |||
824 | ||||||
825 | # Build an alternation sorted by longest sequence first | |||||
826 | my $pattern = join '|', | |||||
827 | 2226 | 1457 | map { quotemeta($_->[0]) } | |||
828 | 3738 | 2000 | sort { length $b->[0] <=> length $a->[0] } | |||
829 | 72 72 | 41 107 | @{$byte_map}; | |||
830 | ||||||
831 | 72 78 | 2557 40 | $string =~ s/($pattern)/do { | |||
832 | 78 | 56 | my $bytes = $1; | |||
833 | 78 3762 78 | 31 2072 56 | my ($pair) = grep { $_->[0] eq $bytes } @{$byte_map}; | |||
834 | 78 | 105 | $pair->[1]; | |||
835 | }/ge; | |||||
836 | ||||||
837 | 72 | 157 | return $string; | |||
838 | } | |||||
839 | ||||||
840 - 890 | =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/Encode-Wide/coverage/> =item * L<HTML::Entities> =item * L<Encode> =item * L<XML::Entities> =item * L<Unicode::Escape> =item * L<https://www.compart.com/en/unicode/> =back =head1 SUPPORT This module is provided as-is without any warranty. Please report any bugs or feature requests to C<bug-encode-wide at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Encode-Wide>. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut | |||||
891 | ||||||
892 | 1; |