File Coverage

File:blib/lib/Encode/Wide.pm
Coverage:84.4%

linestmtbrancondsubtimecode
1package Encode::Wide;
2
3# TODO: don't transform anything within <script>...</script> in wide_to_html
4
5
3
3
3
66054
3
47
use strict;
6
3
3
3
4
2
53
use warnings;
7
8
3
3
3
3
3
34
use Exporter qw(import);
9
3
3
3
509
6771
130
use HTML::Entities;
10
3
3
3
495
15621
58
use Params::Get 0.13;
11
3
3
3
761
11461
3909
use Term::ANSIColor;
12
13our @EXPORT_OK = qw(wide_to_html wide_to_xml);
14
15# Encode to HTML whatever the non-ASCII encoding scheme has been chosen
16# Can't use HTML:Entities::encode since that doesn't seem to cope with
17#       all encodings and misses some characters
18#
19# See https://www.compart.com/en/unicode/U+0161 etc.
20#       https://www.compart.com/en/unicode/U+00EB
21#
22# keep_hrefs => 1 means ensure hyperlinks still work
23# keep_apos => 1 means keep apostrophes, useful within <script>
24
25=encoding UTF-8
26
27 - 35
=head1 NAME

Encode::Wide - Convert wide characters (Unicode, UTF-8, etc.) into HTML or XML-safe ASCII entities

=head1 VERSION

0.06

=cut
36
37our $VERSION = 0.06;
38
39 - 104
=head1 SYNOPSIS

    use Encode::Wide qw(wide_to_html wide_to_xml);

    my $html = wide_to_html(string => "Café déjà vu – naïve façade");
    # returns: 'Caf&eacute; d&eacute;j&agrave; vu &ndash; na&iuml;ve fa&ccedil;ade'

    my $xml = wide_to_xml(string => "Café déjà vu – naïve façade");
    # returns: 'Caf&#xE9; d&#xE9;j&#xE0; vu &#x2013; na&#xEF;ve fa&#xE7;ade'

=head1 DESCRIPTION

Encode::Wide provides functions for converting wide (Unicode) characters into ASCII-safe
formats suitable for embedding in HTML or XML documents.
It is especially useful when dealing with text containing accented or typographic characters that need
to be safely represented in markup.

Other modules exist to do this,
however they tend to have assumptions on the input,
whereas this should work with UTF-8, Unicode, or anything that's common.

The module offers two exportable functions:

=over 4

=item * C<wide_to_html(string => $text)>

Converts all non-ASCII characters in the input string to their named HTML entities if available,
or hexadecimal numeric entities otherwise.
Common characters such as `é`, `à`, `&`, `<`, `>` are
converted to their standard HTML representations like `&eacute;`, `&agrave;`, `&amp;`, etc.

=item * C<wide_to_xml(string => $text)>

Converts all non-ASCII characters in the input string to hexadecimal numeric entities.
Unlike HTML, XML does not support many named entities, so this function ensures compliance
by using numeric representations such as `&#xE9;` for `é`.

=back

=head1 PARAMETERS

Both functions accept a named parameter:

=over 4

=item * C<string> — The Unicode string to convert.

=back

=head1 ENCODING

Input strings are expected to be valid UTF-8 or Unicode.
If a byte string is passed, the module will attempt to decode it appropriately.
Output is guaranteed to be pure ASCII.

=head1 EXPORT

None by default.

Optionally exportable:

    wide_to_html
    wide_to_xml

=cut
105
106sub wide_to_html
107{
108
25
288149
        my $params = Params::Get::get_params('string', @_);
109
110
24
327
        my $string = $params->{'string'};
111
24
24
        my $complain = $params->{'complain'};
112
113
24
38
        if(!defined($string)) {
114
0
0
                my $i = 0;
115
0
0
                while((my @call_details = caller($i++))) {
116
0
0
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
117                }
118
0
0
                die 'Usage: wide_to_html() string not set';
119        }
120
121
24
34
        if(ref($string) eq 'SCALAR') {
122
9
9
9
9
                $string = ${$string};
123        }
124
125        # print STDERR __LINE__, ": ($string)";
126        # print STDERR (sprintf '%v02X', $string);
127        # print STDERR "\n";
128        # my $i = 0;
129        # while((my @call_details = caller($i++))) {
130                # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
131        # }
132
133
24
65
        $string = HTML::Entities::decode($string);
134        # $string =~ s/ & / &amp; /g;
135
136        # I don't think HTML::Entities does these
137
24
47
        my %entity_map = (
138                '&ccaron;' => 'č',
139                '&zcaron;' => 'ž',
140                '&Scaron;' => 'Å ',
141        );
142
143
24
61
        $string =~ s{
144                # ([\x80-\x{10FFFF}])
145                (.)
146        }{
147
514
454
                my $cp = $1;
148                exists $entity_map{$cp}
149
514
754
                        ? $entity_map{$cp}
150                        : $cp
151        }gex;
152
153        # Escape only if it's not already part of an entity
154
24
54
        $string =~ s/&(?![A-Za-z#0-9]+;)/&amp;/g;
155
156
24
31
        unless($params->{'keep_hrefs'}) {
157
22
46
                %entity_map = (
158                        '<' => '&lt;',
159                        '>' => '&gt;',
160                        '"' => '&quot;',
161                );
162
22
32
                $string =~ s{(.)}{
163
458
338
                        my $cp = $1;
164                        exists $entity_map{$cp}
165
458
544
                                ? $entity_map{$cp}
166                                : $cp
167                }gex;
168        }
169
170        # $string =~ s/&db=/&amp;db=/g;
171        # $string =~ s/&id=/&amp;id=/g;
172
173        # Table of byte-sequences->entities
174
24
82
        my @byte_map = (
175                ['“', '&quot;'],  # U+201C
176                ['”', '&quot;'],  # U+201D
177                ["\xe2\x80\x9c", '&quot;'],       # “
178                ["\xe2\x80\x9d", '&quot;'],       # ”
179                ["\xe2\x80\x93", '&ndash;'],
180                ["\xe2\x80\x94", '&mdash;'],
181                ["\xe2\x80\x98", '&apos;'],       # ‘
182                ["\xe2\x80\x99", '&apos;'],       # ’
183                ["\xe2\x80\xA6", '...'],      # …
184                ['!', '&excl;'],    # Do this early before the ascii check, since it's an ascii character
185        );
186
187
24
29
        $string = _sub_map(\$string, \@byte_map);
188
189
24
28
        unless($params->{'keep_apos'}) {
190                # We can't combine since each char in the multi-byte matches, not the entire multi-byte
191                # $string =~ s/['‘’‘\x98]/&apos;/g;
192
22
52
                %entity_map = (
193                        "'" => '&apos;',
194                        '‘' => '&apos;',
195                        '’' => '&apos;',
196                        '‘' => '&apos;',
197                        "\x98" => '&apos;',
198                );
199
200
22
25
                $string =~ s{
201                        # ([\x80-\x{10FFFF}])
202                        (.)
203                }{
204
571
329
                        my $cp = $1;
205                        exists $entity_map{$cp}
206
571
506
                                ? $entity_map{$cp}
207                                : $cp
208                }gex;
209        }
210
211
24
45
        if($string !~ /[^[:ascii:]]/) {
212
18
60
                return $string;
213        }
214
215        @byte_map = (
216
6
106
                ["\xc2\xa0", ' '],    # Non breaking space
217                ["\xc2\xa3", '&pound;'],
218                ["\xc2\xa9", '&copy;'],
219                ["\xc2\xae", '&reg;'],
220                ["\xc3\xa2", '&acirc;'],
221                ["\xc3\xa4", '&auml;'],
222                ["\xc3\xa9", '&eacute;'],
223                ["\xc2\xaa", '&ordf;'],   # ª
224                ["\xc2\xab", '&quot;'],   # «
225                ["\xc2\xbb", '&quot;'],   # »
226                ["\xc3\x81", '&Aacute;'], # Á
227                ["\xc3\x83", '&Icirc;'],  # ÃŽ
228                ["\xc3\x9e", '&THORN;'],  # Þ
229                ["\xc3\xa0", '&agrave;'], # à
230                ["\xc3\xa1", '&aacute;'], # á
231                ["\xc3\xad", '&iacute;'], # í
232                ["\xc3\xb0", '&eth;'],    # ð
233                ["\xc3\xba", '&uacute;'], # ú
234                ["\xc3\xb4", '&ocirc;'],  # ô
235                ["\xc3\xb6", '&ouml;'],
236                ["\xc3\xb8", '&oslash;'], # ø
237                ["\xc5\xa1", '&scaron;'],
238                ["\xc4\x8d", '&ccaron;'],
239                ["\xc5\xbe", '&zcaron;'],
240                ["\xc3\xa5", '&aring;'],  # Ã¥
241                ["\xc3\xa7", '&ccedil;'],
242                ["\xc3\xaf", '&iuml;'],   # ï
243                ["\xc3\xb3", '&oacute;'],
244                ["\xc3\x96", '&Ouml;'], # Ö
245                ["\xc3\xa8", '&egrave;'],
246                ["\xc3\x89", '&Eacute;'],
247                ["\xc3\x9f", '&szlig;'],
248                ["\xc3\xaa", '&ecirc;'],
249                ["\xc3\xab", '&euml;'],
250                ["\xc3\xae", '&icirc;'],
251                ["\xc3\xbb", '&ucirc;'],
252                ["\xc3\xbc", '&uuml;'], # ü
253                ["\xc3\xbe", '&thorn;'],  # þ
254                ["\xc5\x9b", '&sacute;'],
255                ["\xc5\xa0", '&Scaron;'],
256                ["\xe2\x80\x93", '&ndash;'],
257                ["\xe2\x80\x94", '&mdash;'],
258                ["\xc3\xb1", '&ntilde;'], # ñ
259                ["\xe2\x80\x9c", '&quot;'],
260                ["\xe2\x80\x9d", '&quot;'],
261                ["\xe2\x80\xa6", '...'],
262                ["\xe2\x97\x8f", '&#x25CF;'],     # ●
263                ["\N{U+00A0}", ' '],
264                ["\N{U+00A3}", '&pound;'],
265                ["\N{U+00A9}", '&copy;'],
266                ["\N{U+00AA}", '&ordf;'], # ª
267                ["\N{U+00AB}", '&quot;'], # «
268                ["\N{U+00AE}", '&reg;'],
269                ["\N{U+00B5}", '&micro;'],        # µ
270                ["\N{U+00BB}", '&quot;'], # »
271                ["\N{U+00CE}", '&Icirc;'],        # ÃŽ
272                ["\N{U+00DE}", '&THORN;'],        # Þ
273                ["\N{U+0161}", '&scaron;'],
274                ["\N{U+010D}", '&ccaron;'],
275                ["\N{U+017E}", '&zcaron;'],
276                ["\N{U+00C9}", '&Eacute;'],
277                ["\N{U+00D6}", '&Ouml;'], # Ö
278                ["\N{U+00DF}", '&szlig;'],        # ß
279                ["\N{U+00E1}", '&aacute;'],       # á
280                ["\N{U+00E2}", '&acirc;'],
281                ["\N{U+00E4}", '&auml;'],
282                ["\N{U+00E5}", '&aring;'],        # Ã¥
283                ["\N{U+00E7}", '&ccedil;'],       # ç
284                ["\N{U+00E8}", '&egrave;'],
285                ["\N{U+00E9}", '&eacute;'],
286                ["\N{U+00ED}", '&iacute;'],       # í
287                ["\N{U+00EE}", '&icirc;'],
288                ["\N{U+00EF}", '&iuml;'], # ï
289                ["\N{U+00F0}", '&eth;'],  # ð
290                ["\N{U+00F1}", '&ntilde;'],       # ñ
291                ["\N{U+00F4}", '&ocirc;'],        # ô
292                ["\N{U+00F6}", '&ouml;'],
293                ["\N{U+00F8}", '&oslash;'],       # ø
294                ["\N{U+00FA}", '&uacute;'],       # ú
295                ["\N{U+00FC}", '&uuml;'], # ü
296                ["\N{U+00FE}", '&thorn;'],        # þ
297                ["\N{U+00C1}", '&Aacute;'],       # Á
298                ["\N{U+00C9}", '&Eacute;'],
299                ["\N{U+00CA}", '&ecirc;'],
300                ["\N{U+00EB}", '&euml;'],
301                ["\N{U+00F3}", '&oacute;'],
302                ["\N{U+015B}", '&sacute;'],
303                ["\N{U+00FB}", '&ucirc;'],
304                ["\N{U+0160}", '&Scaron;'],
305                ["\N{U+2013}", '&ndash;'],
306                ["\N{U+2014}", '&mdash;'],
307                ["\N{U+2018}", '&quot;'],
308                ["\N{U+2019}", '&quot;'],
309                ["\N{U+201C}", '&quot;'],
310                ["\N{U+201D}", '&quot;'],
311                ["\N{U+2026}", '...'],        # …
312                ["\N{U+25CF}", '&#x25CF;'],       # ●
313        );
314
315
6
8
        $string = _sub_map(\$string, \@byte_map);
316
317        # utf8::encode($string);
318        # $string =~ s/Å¡/&scaron;/g;
319        # $string =~ s/č/&ccaron;/g;
320        # $string =~ s/ž/&zcaron;/g;
321        # $string =~ s/é/&eacute;/g;
322        # $string =~ s/ç/&ccedil;/g;
323        # $string =~ s/\N{U+0161}/&scaron;/g;
324        # $string =~ s/\N{U+010D}/&ccaron;/g;
325        # $string =~ s/\N{U+017E}/&zcaron;/g;
326        # $string =~ s/\N{U+00E9}/&eacute;/g;
327        # $string =~ s/\N{U+00D6}/&Ouml;/g; # Ö
328        # $string =~ s/\N{U+00E7}/&ccedil;/g;       # ç
329        # $string =~ s/\N{U+00E8}/&egrave;/g;
330        # $string =~ s/\N{U+00E9}/&Eacute;/g;
331
332        # print STDERR __LINE__, ": ($string)";
333        # print STDERR (sprintf '%v02X', $string);
334        # print STDERR "\n";
335
336        # utf8::decode($string);
337
338        # print STDERR __LINE__, ": ($string)";
339        # # print STDERR (sprintf '%v02X', $string);
340        # print STDERR "\n";
341
342        # $string =~ s/\xe4\x8d/&ccaron;/g; # ? ACOM strangeness
343        # $string =~ s/\N{U+0161}/&scaron;/g;
344        # $string =~ s/\N{U+010D}/&ccaron;/g;
345        # $string =~ s/\N{U+00A9}/&copy;/g;
346        # $string =~ s/\N{U+00AE}/&reg;/g;
347        # $string =~ s/\N{U+00E2}/&acirc;/g;
348        # $string =~ s/\N{U+00E4}/&auml;/g;
349        # $string =~ s/\N{U+00E8}/&egrave;/g;
350        # $string =~ s/\N{U+00E9}/&eacute;/g;
351        # $string =~ s/\N{U+00EB}/&euml;/g;
352        # $string =~ s/\N{U+00F3}/&oacute;/g;
353        # $string =~ s/\N{U+00FB}/&ucirc;/g;
354        # $string =~ s/\N{U+017E}/&zcaron;/g;
355        # $string =~ s/\N{U+00D6}/&Ouml;/g; # Ö
356        # $string =~ s/\N{U+00E7}/&ccedil;/g;       # ç
357        # $string =~ s/\N{U+00C9}/&Eacute;/g;
358        # $string =~ s/\N{U+00CA}/&ecirc;/g;
359        # $string =~ s/\N{U+0160}/&Scaron;/g;       # FIXME: also above
360        # $string =~ s/\N{U+2013}/-/g;
361
362
6
73
        @byte_map = (
363                [ 'Á', '&Aacute;' ],
364                [ 'Ã¥', '&aring;' ],
365                [ 'ª', '&ordf;' ],
366                [ 'Å¡', '&scaron;' ],
367                [ 'Å ', '&Scaron;' ],
368                [ 'č', '&ccaron;' ],
369                [ 'ž', '&zcaron;' ],
370                [ 'á', '&aacute;' ],
371                [ 'â', '&acirc;' ],
372                [ 'é', '&eacute;' ],
373                [ 'è', '&egrave;' ],
374                [ 'ç', '&ccedil;' ],
375                [ 'ê', '&ecirc;' ],
376                [ 'ë', '&euml;' ],
377                [ 'ð', '&eth;' ],
378                [ 'í', '&iacute;' ],
379                [ 'ï', '&iuml;' ],
380                [ 'ÃŽ', '&Iicrc;' ],
381                [ '©', '&copy;' ],
382                [ '®', '&reg;' ],
383                [ 'ó', '&oacute;' ],
384                [ 'ô', '&ocirc;' ],
385                [ 'ö', '&ouml;' ],
386                [ 'ø', '&oslash;' ],
387                [ 'Å›', '&sacute;' ],
388                [ 'Þ', '&THORN;' ],
389                [ 'þ', '&thorn;' ],
390                [ 'û', '&ucirc;' ],
391                [ 'ü', '&uuml;' ],
392                [ 'ú', '&uacute;' ],
393                [ 'µ', '&micro;'],
394                [ '£', '&pound;' ],
395                [ 'ß', '&szlig;' ],
396                [ '–', '&ndash;' ],
397                [ '—', '&mdash;' ],
398                [ 'ñ', '&ntilde;' ],
399                [ '“', '&quot;' ],
400                [ '”', '&quot;' ],
401                [ '«', '&quot;' ],
402                [ '»', '&quot;' ],
403                [ '…', '...' ],
404                [ '●', '&#x25CF;' ],
405                [ "\x80\$", ' ' ],
406        );
407
408
6
8
        $string = _sub_map(\$string, \@byte_map);
409
410        # if($string =~ /^Maria\(/) {
411                # # print STDERR (unpack 'H*', $string);
412                # print STDERR __LINE__, ': ';
413                # print STDERR (sprintf '%v02X', $string);
414                # print STDERR "\n";
415                # my $i = 0;
416                # while((my @call_details = caller($i++))) {
417                        # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
418                # }
419                # # die $string;
420        # }
421
422        # print STDERR __LINE__, ": ($string)\n";
423
6
10
        if($string =~ /[^[:ascii:]]/) {
424
1
2
                $string = HTML::Entities::encode_entities_numeric($string, '\x80-\x{10FFFF}');
425
1
39
                if($string =~ /[^[:ascii:]]/) {
426
0
0
                        print STDERR (unpack 'H*', $string);
427
0
0
                        print STDERR __LINE__, ': ';
428
0
0
                        print STDERR (sprintf '%v02X', $string), "\n";
429
0
0
                        my $i = 0;
430
0
0
                        while((my @call_details = caller($i++))) {
431
0
0
                                print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
432                        }
433
0
0
                        $complain->("TODO: wide_to_html($string)") if($complain);
434
0
0
                        warn "TODO: wide_to_html($string)";
435                        # $string =~ s/[^[:ascii:]]/XXXXX/g;
436
0
0
                        $string =~ s{
437                                        ([^[:ascii:]])
438                                }{
439
0
0
                                        '>>>>' . sprintf("%04X", ord($1)) . '<<<<'
440                                }gex;   # e=evaluate, g=global, x=extended
441
0
0
                        die "BUG: wide_to_html($string)";
442                }
443        }
444
445
6
28
        return $string;
446}
447
448# See https://www.compart.com/en/unicode/U+0161 etc.
449#       https://www.compart.com/en/unicode/U+00EB
450sub wide_to_xml
451{
452
25
10701
        my $params = Params::Get::get_params('string', @_);
453
454
24
322
        my $string = $params->{'string'};
455
24
21
        my $complain = $params->{'complain'};
456
457
24
39
        if(!defined($string)) {
458
0
0
                my $i = 0;
459
0
0
                while((my @call_details = caller($i++))) {
460
0
0
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
461                }
462
0
0
                die 'Usage: string not set';
463        }
464
465
24
31
        if(ref($string) eq 'SCALAR') {
466
9
9
7
10
                $string = ${$string};
467        }
468
469        # print STDERR __LINE__, ": ($string)";
470        # print STDERR (sprintf '%v02X', $string);
471        # print STDERR "\n";
472
473        # my $i = 0;
474        # while((my @call_details = caller($i++))) {
475                # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
476        # }
477
478
24
60
        $string = HTML::Entities::decode($string);
479        # print STDERR __LINE__, ": ($string)\n";
480
481        # $string =~ s/&amp;/&/g;
482
483        # I don't think HTML::Entities does these
484
24
41
        my %entity_map = (
485                '&ccaron;' => 'č',
486                '&zcaron;' => 'ž',
487                '&Scaron;' => 'Å ',
488        );
489
490
24
64
        $string =~ s{
491                # ([\x80-\x{10FFFF}])
492                (.)
493        }{
494
514
448
                my $cp = $1;
495                exists $entity_map{$cp}
496
514
749
                        ? $entity_map{$cp}
497                        : $cp
498        }gex;
499
500        # Escape only if it's not already part of an entity
501
24
50
        $string =~ s/&(?![A-Za-z#0-9]+;)/&amp;/g;
502
503
24
36
        unless($params->{'keep_hrefs'}) {
504
22
53
                %entity_map = (
505                        '<' => '&lt;',
506                        '>' => '&gt;',
507                        '"' => '&quot;',
508                        '“' => '&quot;',       # U+201C
509                        '”' => '&quot;',       # U+201D
510                );
511
512
22
34
                $string =~ s{(.)}{
513
458
313
                        my $cp = $1;
514                        exists $entity_map{$cp}
515
458
547
                                ? $entity_map{$cp}
516                                : $cp
517                }gex;
518        }
519
520        # $string =~ s/‘/&apos;/g;
521        # $string =~ s/’/&apos;/g;
522        # $string =~ s/‘/&apos;/g;
523        # $string =~ s/‘/&apos;/g;
524        # $string =~ s/\x98/&apos;/g;
525        # $string =~ s/['‘’‘\x98]/&apos;/g;
526
527        # Table of byte-sequences->entities
528
24
95
        my @byte_map = (
529                [ "\xe2\x80\x9c", '&quot;' ],     # “
530                [ "\xe2\x80\x9d", '&quot;' ],     # ”
531                [ '“', '&quot;' ],        # U+201C
532                [ '”', '&quot;' ],        # U+201D
533                [ "\xe2\x80\x93", '-' ],      # ndash
534                [ "\xe2\x80\x94", '-' ],      # mdash
535                [ "\xe2\x80\x98", '&apos;' ],     # ‘
536                [ "\xe2\x80\x99", '&apos;' ],     # ’
537                [ "\xe2\x80\xA6", '...' ],    # …
538                [ "'", '&apos;' ],
539                [ '‘', '&apos;' ],
540                [ '’', '&apos;' ],
541                [ '‘', '&apos;' ],
542                [ "\x98", '&apos;' ],
543        );
544
545
24
28
        $string = _sub_map(\$string, \@byte_map);
546
547
24
187
        %entity_map = (
548                '&copy;' => '&#x0A9;',
549                '&Aacute;' => '&#x0C1;',     # Á
550                '&ccaron;' => '&#x10D;',
551                '&agrave;' => '&#x0E0;',     # á
552                '&aacute;' => '&#x0E1;',     # á
553                '&acirc;' => '&#x0E2;',              # â
554                '&auml;' => '&#x0E4;',               # ä
555                '&aring;' => '&#x0E5;',      # Ã¥
556                '&ccedil;' => '&#x0E7;',     # ç
557                '&egrave;' => '&#x0E8;',
558                '&eacute;' => '&#x0E9;',
559                '&ecirc;' => '&#x0EA;',
560                '&euml;' => '&#x0EB;',       # euml
561                '&Icirc;' => '&#x0CE;',      # ÃŽ
562                '&Eacute;' => '&#x0C9;',
563                '&szlig;' => '&#x0DF;',      # ß
564                '&iacute;' => '&#xED;',      # í
565                '&icirc;' => '&#x0EE;',
566                '&iuml;' => '&#x0EF;',       # ï
567                '&eth;' => '&#x0F0;',        # ð
568                '&uacute;' => '&#0FA;',      # ú
569                '&uuml;' => '&#x0FC;',
570                '&scaron;' => '&#x161;',
571                '&oacute;' => '&#x0F3;',     # ó
572                '&ucirc;' => '&#x0F4;',
573                '&ouml;' => '&#x0F6;',
574                '&ordf;' => '&#x0AA;',       # ª
575                '&oslash;' => '&#x0F8;',     # ø
576                '&zcaron;' => '&#x17E;',
577                '&Scaron;' => '&#x160;',
578                '&THORN;' => '&#x0DE;',      # Þ
579                '&thorn;' => '&#x0FE;',      # þ
580                '&reg;' => '&#x0AE;',
581                '&pound;' => '&#163;',
582                '&ntilde;' => '&#x0F1;',
583                '&mdash;' => '-',
584                '&ndash;' => '-',
585                '&excl;' => '!',
586        );
587
588
24
32
        $string =~ s{(.)}{
589
611
329
                my $cp = $1;
590                exists $entity_map{$cp}
591
611
587
                        ? $entity_map{$cp}
592                        : $cp
593        }gex;
594
595
24
45
        if($string !~ /[^[:ascii:]]/) {
596
18
92
                return $string;
597        }
598
599        # print STDERR __LINE__, ": ($string)";
600        # print STDERR (sprintf '%v02X', $string);
601        # print STDERR "\n";
602
603        @byte_map = (
604
6
94
                ["\xc2\xa0", ' '],    # Non breaking space
605                ["\xc2\xa3", '&#x0A3;'],  # £
606                ["\xc2\xa9", '&#x0A9;'],
607                ["\xc2\xaa", '&#x0AA;'],  # ª
608                ["\xc2\xab", '&quot;'],   # «
609                ["\xc2\xae", '&#x0AE;'],
610                ["\xc3\x81", '&#x0C1;'],  # Á
611                ["\xc3\x8e", '&#x0CE;'],  # ÃŽ
612                ["\xc3\xa0", '&#x0E0;'],  # à
613                ["\xc3\xa1", '&#x0E1;'],  # á
614                ["\xc3\xa5", '&#x0E5;'],  # Ã¥
615                ["\xc3\xa9", '&#x0E9;'],
616                ["\xc3\xaf", '&#x0EF;'],  # ï
617                ["\xc3\xb1", '&#x0F1;'],  # ntilde ñ
618                ["\xc5\xa1", '&#x161;'],
619                ["\xc4\x8d", '&#x10D;'],
620                ["\xc5\xbe", '&#x17E;'],  # ž
621                ["\xc3\x96", '&#x0D6;'],  # Ö
622                ["\xc3\x9e", '&#x0DE;'],  # Þ
623                ["\xc3\x9f", '&#x0DF;'],  # ß
624                ["\xc3\xa2", '&#x0E2;'],  # â
625                ["\xc3\xad", '&#x0ED;'],  # í
626                ["\xc3\xa4", '&#x0E4;'],  # ä
627                ["\xc3\xa7", '&#x0E7;'],  # ç
628                ["\xc3\xb0", '&#x0F0;'],  # ð
629                ["\xc3\xb3", '&#x0F3;'],  # ó
630                ["\xc3\xb8", '&#x0F8;'],  # ø
631                ["\xc3\xbc", '&#x0FC;'],  # ü
632                ["\xc3\xbe", '&#x0FE;'],  # þ
633                ["\xc3\xa8", '&#x0E8;'],  # è
634                ["\xc3\xee", '&#x0EE;'],
635                ["\xc3\xb4", '&#x0F4;'],  # ô
636                ["\xc3\xb6", '&#x0F6;'],  # ö
637                ["\xc3\x89", '&#x0C9;'],
638                ["\xc3\xaa", '&#x0EA;'],
639                ["\xc3\xab", '&#x0EB;'],  # eumlaut
640                ["\xc3\xba", '&#x0FA;'],  # ú
641                ["\xc3\xbb", '&#x0BB;'],  # û - ucirc
642                ["\xc5\x9b", '&#x15B;'],  # Å› - sacute
643                ["\xc5\xa0", '&#x160;'],
644                ["\xe2\x80\x93", '-'],
645                ["\xe2\x80\x94", '-'],
646                ["\xe2\x80\x9c", '&quot;'],
647                ["\xe2\x80\x9d", '&quot;'],
648                ["\xe2\x80\xa6", '...'],
649                ["\xe2\x97\x8f", '&#x25CF;'],     # ●
650                ["\xe3\xb1", '&#x0F1;'],  # ntilde ñ - what's this one?
651
652        # $string =~ s/\xe4\x8d/&#x10D;/g;  # ? ACOM strangeness
653        # $string =~ s/\N{U+0161}/&#x161;/g;
654        # $string =~ s/\N{U+010D}/&#x10D;/g;
655        # $string =~ s/\N{U+00E9}/&#x0E9;/g;
656        # $string =~ s/\N{U+017E}/&#x17E;/g;
657
658                ["\N{U+00A0}", ' '],
659                ["\N{U+010D}", '&#x10D;'],
660                ["\N{U+00AB}", '&quot;'], # «
661                ["\N{U+00AE}", '&#x0AE;'],        # ®
662                ["\N{U+00B5}", '&#x0B5;'],        # µ
663                ["\N{U+00C1}", '&#x0C1;'],        # Á
664                ["\N{U+00CE}", '&#x0CE;'],        # ÃŽ
665                ["\N{U+00DE}", '&#x0DE;'],        # Þ
666                ["\N{U+00E4}", '&#x0E4;'],        # ä
667                ["\N{U+00E5}", '&#x0E5;'],        # Ã¥
668                ["\N{U+00EA}", '&#x0EA;'],
669                ["\N{U+00ED}", '&#x0ED;'],
670                ["\N{U+00EE}", '&#x0EE;'],
671                ["\N{U+00FE}", '&#x0FE;'],        # þ
672                ["\N{U+00C9}", '&#x0C9;'],
673                ["\N{U+017E}", '&#x17E;'],        # ž
674                ["\N{U+00D6}", '&#x0D6;'],        # Ö
675                ["\N{U+00DF}", '&#x0DF;'],        # ß
676                ["\N{U+00E1}", '&#x0E1;'],        # á - aacute
677                ["\N{U+00E2}", '&#x0E2;'],
678                ["\N{U+00E8}", '&#x0E8;'],        # è
679                ["\N{U+00EF}", '&#x0EF;'],        # ï
680                ["\N{U+00F0}", '&#x0F0;'],        # ð
681                ["\N{U+00F1}", '&#x0F1;'],        # ñ
682                ["\N{U+00F3}", '&#x0F3;'],        # ó
683                ["\N{U+00F4}", '&#x0F4;'],        # ô
684                ["\N{U+00F6}", '&#x0F6;'],        # ö
685                ["\N{U+00F8}", '&#x0F8;'],        # ø
686                ["\N{U+00FA}", '&#x0FA;'],        # ú
687                ["\N{U+00FC}", '&#x0FC;'],        # ü
688                ["\N{U+015B}", '&#x15B;'],        # Å›
689        # print STDERR __LINE__, ": ($string)";
690        # print STDERR (sprintf '%v02X', $string);
691        # print STDERR "\n";
692                ["\N{U+00E9}", '&#x0E9;'],
693        # print STDERR __LINE__, ": ($string)";
694        # print STDERR (sprintf '%v02X', $string);
695        # print STDERR "\n";
696                ["\N{U+00E7}", '&#x0E7;'],        # ç
697                ["\N{U+00EB}", '&#x0EB;'],        # ë
698                ["\N{U+00FB}", '&#x0FB;'],        # û
699                ["\N{U+0160}", '&#x160;'],
700                ["\N{U+0161}", '&#x161;'],
701                ["\N{U+00A9}", '&#x0A9;'],        # ©
702        # print STDERR __LINE__, ": ($string)";
703        # print STDERR (sprintf '%v02X', $string);
704        # print STDERR "\n";
705                ["\N{U+2013}", '-'],
706                ["\N{U+2014}", '-'],
707                ["\N{U+2018}", '&quot;'],
708                ["\N{U+2019}", '&quot;'],
709                ["\N{U+201C}", '&quot;'],
710                ["\N{U+201D}", '&quot;'],
711                ["\N{U+2026}", '...'],        # …
712                ["\N{U+25CF}", '&#x25CF;'],       # ●
713        );
714
715
6
7
        $string = _sub_map(\$string, \@byte_map);
716
717        # utf8::encode($string);
718        # $string =~ s/Å¡/&s#x161;/g;
719        # $string =~ s/č/&#x10D;/g;
720        # $string =~ s/ž/&z#x17E;/g;
721        # $string =~ s/é/&#x0E9;/g;
722        # $string =~ s/Ö/&#x0D6;/g;
723        # $string =~ s/ç/&#x0E7;/g;
724        # $string =~ s/\N{U+0161}/&#x161;/g;
725        # $string =~ s/\N{U+010D}/&#x10D;/g;
726        # $string =~ s/\N{U+017E}/&#x17E;/g;
727        # $string =~ s/\N{U+00E9}/&#x0E9;/g;
728        # $string =~ s/\N{U+00D6}/&#x0D6;/g;        # Ö
729        # $string =~ s/\N{U+00E7}/&#x0E7;/g;        # ç
730
731        # print STDERR __LINE__, ": ($string)";
732        # print STDERR (sprintf '%v02X', $string);
733        # print STDERR "\n";
734
735        # utf8::decode($string);
736
737        # $string =~ s/['\x98]/&#039;/g;
738
6
70
        @byte_map = (
739                ["'", '&#039;'],
740                ["\x98", '&#039;'],
741                ['©', '&#x0A9;'],
742                ['ª', '&#x0AA;'],
743                ['®', '&#x0AE;'],
744                ['Ã¥', '&#x0E5;'],
745                ['Å¡', '&#x161;'],
746                ['č', '&#x10D;'],
747                ['ž', '&#x17E;'],
748                ['£', '&#x0A3;'],
749                ['µ', '&#x0B5;'],
750                ['á', '&#x0E1;'],     # á
751                ['â', '&#x0E2;'],
752                ['ä', '&#x0E4;'],    # ä
753                ['Á', '&#x0C1;'],      # Á
754                ['Ö', '&#x0D6;'],
755                ['ß', '&#x0DF;'],
756                ['ç', '&#x0E7;'],
757                ['è', '&#x0E8;'],
758                ['é', '&#x0E9;'],
759                ['ê', '&#x0EA;'],
760                ['ë', '&#x0EB;'],
761                ['í', '&#x0ED;'],
762                ['ï', '&#x0EF;'],
763                ['ÃŽ', '&#x0CE;'],      # ÃŽ
764                ['Þ', '&#x0DE;'],      # Þ
765                ['ð', '&#x0F0;'],       # ð
766                ['ø', '&#x0F8;'],     # ø
767                ['û', '&#x0FB;'],
768                ['ñ', '&#x0F1;'],
769                ['ú', '&#x0FA;'],
770                ['ü', '&#x0FC;'],
771                ['þ', '&#x0FE;'],    # þ
772                ['“', '&quot;'],
773                ['”', '&quot;'],
774                ['«', '&quot;'],
775                ['»', '&quot;'],
776                ['—', '-'],
777                ['–', '-'],
778                ['…', '...'],
779                ['●', '&#x25CF;'],
780                ["\x80\$", ' '],
781        );
782
783
6
7
        $string = _sub_map(\$string, \@byte_map);
784
785        # if($string =~ /^Maria\(/) {
786                # print STDERR (unpack 'H*', $string);
787                # print STDERR __LINE__, ': ';
788                # print STDERR (sprintf '%v02X', $string);
789                # print STDERR "\n";
790                # my $i = 0;
791                # while((my @call_details = caller($i++))) {
792                        # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
793                # }
794                # die $string;
795        # }
796
797        # print STDERR __LINE__, ": ($string)\n";
798
6
11
        if($string =~ /[^[:ascii:]]/) {
799
1
12
                print STDERR (unpack 'H*', $string);
800
1
4
                print STDERR __LINE__, ': ';
801
1
3
                print STDERR (sprintf '%v02X', $string);
802
1
1
                print STDERR "\n";
803
1
1
                my $i = 0;
804
1
4
                while((my @call_details = caller($i++))) {
805
2
37
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
806                }
807
1
20
                $complain->("TODO: wide_to_xml($string)") if($complain);
808
1
7
                warn "TODO: wide_to_xml($string)";
809                # $string =~ s/[^[:ascii:]]/XXXXX/g;
810
1
315
                $string =~ s{
811                                ([^[:ascii:]])
812                        }{
813
2
5
                                '>>>>' . sprintf("%04X", ord($1)) . '<<<<'
814                        }gex;   # e=evaluate, g=global, x=extended
815
1
6
                die "BUG: wide_to_xml($string)";
816        }
817
5
33
        return $string;
818}
819
820sub _sub_map
821{
822
72
72
41
54
        my $string = ${$_[0]};
823
72
49
        my $byte_map = $_[1];
824
825        # Build an alternation sorted by longest sequence first
826        my $pattern = join '|',
827
2226
1421
                map { quotemeta($_->[0]) }
828
3738
1996
                sort { length $b->[0] <=> length $a->[0] }
829
72
72
45
103
                @{$byte_map};
830
831
72
78
2523
42
        $string =~ s/($pattern)/do {
832
78
51
                my $bytes = $1;
833
78
3762
78
35
2074
52
                my ($pair) = grep { $_->[0] eq $bytes } @{$byte_map};
834
78
106
                $pair->[1];
835        }/ge;
836
837
72
159
        return $string;
838}
839
840 - 890
=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/Encode-Wide/coverage/>

=item * L<HTML::Entities>

=item * L<Encode>

=item * L<XML::Entities>

=item * L<Unicode::Escape>

=item * L<https://www.compart.com/en/unicode/>

=back

=head1 SUPPORT

This module is provided as-is without any warranty.

Please report any bugs or feature requests to C<bug-encode-wide at rt.cpan.org>,
or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Encode-Wide>.
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
891
8921;