File Coverage

File:blib/lib/Encode/Wide.pm
Coverage:84.4%

linestmtbrancondsubtimecode
1package Encode::Wide;
2
3# TODO: don't transform anything within <script>...</script> in wide_to_html
4
5
3
3
3
68419
2
44
use strict;
6
3
3
3
5
2
54
use warnings;
7
8
3
3
3
6
1
29
use Exporter qw(import);
9
3
3
3
574
6869
120
use HTML::Entities;
10
3
3
3
567
16166
56
use Params::Get 0.13;
11
3
3
3
854
11510
3902
use Term::ANSIColor;
12
13our @EXPORT_OK = qw(wide_to_html wide_to_xml);
14
15# Encode to HTML whatever the non-ASCII encoding scheme has been chosen
16# Can't use HTML:Entities::encode since that doesn't seem to cope with
17#       all encodings and misses some characters
18#
19# See https://www.compart.com/en/unicode/U+0161 etc.
20#       https://www.compart.com/en/unicode/U+00EB
21#
22# keep_hrefs => 1 means ensure hyperlinks still work
23# keep_apos => 1 means keep apostrophes, useful within <script>
24
25=encoding UTF-8
26
27 - 35
=head1 NAME

Encode::Wide - Convert wide characters (Unicode, UTF-8, etc.) into HTML or XML-safe ASCII entities

=head1 VERSION

0.06

=cut
36
37our $VERSION = 0.06;
38
39 - 104
=head1 SYNOPSIS

    use Encode::Wide qw(wide_to_html wide_to_xml);

    my $html = wide_to_html(string => "Café déjà vu – naïve façade");
    # returns: 'Caf&eacute; d&eacute;j&agrave; vu &ndash; na&iuml;ve fa&ccedil;ade'

    my $xml = wide_to_xml(string => "Café déjà vu – naïve façade");
    # returns: 'Caf&#xE9; d&#xE9;j&#xE0; vu &#x2013; na&#xEF;ve fa&#xE7;ade'

=head1 DESCRIPTION

Encode::Wide provides functions for converting wide (Unicode) characters into ASCII-safe
formats suitable for embedding in HTML or XML documents.
It is especially useful when dealing with text containing accented or typographic characters that need
to be safely represented in markup.

Other modules exist to do this,
however they tend to have assumptions on the input,
whereas this should work with UTF-8, Unicode, or anything that's common.

The module offers two exportable functions:

=over 4

=item * C<wide_to_html(string => $text)>

Converts all non-ASCII characters in the input string to their named HTML entities if available,
or hexadecimal numeric entities otherwise.
Common characters such as `é`, `à`, `&`, `<`, `>` are
converted to their standard HTML representations like `&eacute;`, `&agrave;`, `&amp;`, etc.

=item * C<wide_to_xml(string => $text)>

Converts all non-ASCII characters in the input string to hexadecimal numeric entities.
Unlike HTML, XML does not support many named entities, so this function ensures compliance
by using numeric representations such as `&#xE9;` for `é`.

=back

=head1 PARAMETERS

Both functions accept a named parameter:

=over 4

=item * C<string> — The Unicode string to convert.

=back

=head1 ENCODING

Input strings are expected to be valid UTF-8 or Unicode.
If a byte string is passed, the module will attempt to decode it appropriately.
Output is guaranteed to be pure ASCII.

=head1 EXPORT

None by default.

Optionally exportable:

    wide_to_html
    wide_to_xml

=cut
105
106sub wide_to_html
107{
108
25
280542
        my $params = Params::Get::get_params('string', @_);
109
110
24
324
        my $string = $params->{'string'};
111
24
23
        my $complain = $params->{'complain'};
112
113
24
32
        if(!defined($string)) {
114
0
0
                my $i = 0;
115
0
0
                while((my @call_details = caller($i++))) {
116
0
0
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
117                }
118
0
0
                die 'BUG: wide_to_html() string not set';
119        }
120
121
24
33
        if(ref($string) eq 'SCALAR') {
122
9
9
6
13
                $string = ${$string};
123        }
124
125        # print STDERR __LINE__, ": ($string)";
126        # print STDERR (sprintf '%v02X', $string);
127        # print STDERR "\n";
128        # my $i = 0;
129        # while((my @call_details = caller($i++))) {
130                # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
131        # }
132
133
24
57
        $string = HTML::Entities::decode($string);
134        # $string =~ s/ & / &amp; /g;
135
136        # I don't think HTML::Entities does these
137
24
44
        my %entity_map = (
138                '&ccaron;' => 'č',
139                '&zcaron;' => 'ž',
140                '&Scaron;' => 'Å ',
141        );
142
143
24
67
        $string =~ s{
144                # ([\x80-\x{10FFFF}])
145                (.)
146        }{
147
514
459
                my $cp = $1;
148                exists $entity_map{$cp}
149
514
744
                        ? $entity_map{$cp}
150                        : $cp
151        }gex;
152
153        # Escape only if it's not already part of an entity
154
24
52
        $string =~ s/&(?![A-Za-z#0-9]+;)/&amp;/g;
155
156
24
32
        unless($params->{'keep_hrefs'}) {
157
22
64
                %entity_map = (
158                        '<' => '&lt;',
159                        '>' => '&gt;',
160                        '"' => '&quot;',
161                );
162
22
31
                $string =~ s{(.)}{
163
458
348
                        my $cp = $1;
164                        exists $entity_map{$cp}
165
458
546
                                ? $entity_map{$cp}
166                                : $cp
167                }gex;
168        }
169
170        # $string =~ s/&db=/&amp;db=/g;
171        # $string =~ s/&id=/&amp;id=/g;
172
173        # Table of byte-sequences->entities
174
24
82
        my @byte_map = (
175                ['“', '&quot;'],  # U+201C
176                ['”', '&quot;'],  # U+201D
177                ["\xe2\x80\x9c", '&quot;'],       # “
178                ["\xe2\x80\x9d", '&quot;'],       # ”
179                ["\xe2\x80\x93", '&ndash;'],
180                ["\xe2\x80\x94", '&mdash;'],
181                ["\xe2\x80\x98", '&apos;'],       # ‘
182                ["\xe2\x80\x99", '&apos;'],       # ’
183                ["\xe2\x80\xA6", '...'],      # …
184                ['!', '&excl;'],    # Do this early before the ascii check, since it's an ascii character
185        );
186
187
24
30
        $string = _sub_map(\$string, \@byte_map);
188
189
24
38
        unless($params->{'keep_apos'}) {
190                # We can't combine since each char in the multi-byte matches, not the entire multi-byte
191                # $string =~ s/['‘’‘\x98]/&apos;/g;
192
22
45
                %entity_map = (
193                        "'" => '&apos;',
194                        '‘' => '&apos;',
195                        '’' => '&apos;',
196                        '‘' => '&apos;',
197                        "\x98" => '&apos;',
198                );
199
200
22
28
                $string =~ s{
201                        # ([\x80-\x{10FFFF}])
202                        (.)
203                }{
204
571
316
                        my $cp = $1;
205                        exists $entity_map{$cp}
206
571
513
                                ? $entity_map{$cp}
207                                : $cp
208                }gex;
209        }
210
211
24
43
        if($string !~ /[^[:ascii:]]/) {
212
18
59
                return $string;
213        }
214
215        @byte_map = (
216
6
105
                ["\xc2\xa0", ' '],    # Non breaking space
217                ["\xc2\xa3", '&pound;'],
218                ["\xc2\xa9", '&copy;'],
219                ["\xc2\xae", '&reg;'],
220                ["\xc3\xa2", '&acirc;'],
221                ["\xc3\xa4", '&auml;'],
222                ["\xc3\xa9", '&eacute;'],
223                ["\xc2\xaa", '&ordf;'],   # ª
224                ["\xc2\xab", '&quot;'],   # «
225                ["\xc2\xbb", '&quot;'],   # »
226                ["\xc3\x81", '&Aacute;'], # Á
227                ["\xc3\x83", '&Icirc;'],  # ÃŽ
228                ["\xc3\x9e", '&THORN;'],  # Þ
229                ["\xc3\xa0", '&agrave;'], # à
230                ["\xc3\xa1", '&aacute;'], # á
231                ["\xc3\xad", '&iacute;'], # í
232                ["\xc3\xb0", '&eth;'],    # ð
233                ["\xc3\xba", '&uacute;'], # ú
234                ["\xc3\xb4", '&ocirc;'],  # ô
235                ["\xc3\xb6", '&ouml;'],
236                ["\xc3\xb8", '&oslash;'], # ø
237                ["\xc5\xa1", '&scaron;'],
238                ["\xc4\x8d", '&ccaron;'],
239                ["\xc5\xbe", '&zcaron;'],
240                ["\xc3\xa5", '&aring;'],  # Ã¥
241                ["\xc3\xa7", '&ccedil;'],
242                ["\xc3\xaf", '&iuml;'],   # ï
243                ["\xc3\xb3", '&oacute;'],
244                ["\xc3\x96", '&Ouml;'], # Ö
245                ["\xc3\xa8", '&egrave;'],
246                ["\xc3\x89", '&Eacute;'],
247                ["\xc3\x9f", '&szlig;'],
248                ["\xc3\xaa", '&ecirc;'],
249                ["\xc3\xab", '&euml;'],
250                ["\xc3\xae", '&icirc;'],
251                ["\xc3\xbb", '&ucirc;'],
252                ["\xc3\xbc", '&uuml;'], # ü
253                ["\xc3\xbe", '&thorn;'],  # þ
254                ["\xc5\x9b", '&sacute;'],
255                ["\xc5\xa0", '&Scaron;'],
256                ["\xe2\x80\x93", '&ndash;'],
257                ["\xe2\x80\x94", '&mdash;'],
258                ["\xc3\xb1", '&ntilde;'], # ñ
259                ["\xe2\x80\x9c", '&quot;'],
260                ["\xe2\x80\x9d", '&quot;'],
261                ["\xe2\x80\xa6", '...'],
262                ["\xe2\x97\x8f", '&#x25CF;'],     # ●
263                ["\N{U+00A0}", ' '],
264                ["\N{U+00A3}", '&pound;'],
265                ["\N{U+00A9}", '&copy;'],
266                ["\N{U+00AA}", '&ordf;'], # ª
267                ["\N{U+00AB}", '&quot;'], # «
268                ["\N{U+00AE}", '&reg;'],
269                ["\N{U+00B5}", '&micro;'],        # µ
270                ["\N{U+00BB}", '&quot;'], # »
271                ["\N{U+00CE}", '&Icirc;'],        # ÃŽ
272                ["\N{U+00DE}", '&THORN;'],        # Þ
273                ["\N{U+0161}", '&scaron;'],
274                ["\N{U+010D}", '&ccaron;'],
275                ["\N{U+017E}", '&zcaron;'],
276                ["\N{U+00C9}", '&Eacute;'],
277                ["\N{U+00D6}", '&Ouml;'], # Ö
278                ["\N{U+00DF}", '&szlig;'],        # ß
279                ["\N{U+00E1}", '&aacute;'],       # á
280                ["\N{U+00E2}", '&acirc;'],
281                ["\N{U+00E4}", '&auml;'],
282                ["\N{U+00E5}", '&aring;'],        # Ã¥
283                ["\N{U+00E7}", '&ccedil;'],       # ç
284                ["\N{U+00E8}", '&egrave;'],
285                ["\N{U+00E9}", '&eacute;'],
286                ["\N{U+00ED}", '&iacute;'],       # í
287                ["\N{U+00EE}", '&icirc;'],
288                ["\N{U+00EF}", '&iuml;'], # ï
289                ["\N{U+00F0}", '&eth;'],  # ð
290                ["\N{U+00F1}", '&ntilde;'],       # ñ
291                ["\N{U+00F4}", '&ocirc;'],        # ô
292                ["\N{U+00F6}", '&ouml;'],
293                ["\N{U+00F8}", '&oslash;'],       # ø
294                ["\N{U+00FA}", '&uacute;'],       # ú
295                ["\N{U+00FC}", '&uuml;'], # ü
296                ["\N{U+00FE}", '&thorn;'],        # þ
297                ["\N{U+00C1}", '&Aacute;'],       # Á
298                ["\N{U+00C9}", '&Eacute;'],
299                ["\N{U+00CA}", '&ecirc;'],
300                ["\N{U+00EB}", '&euml;'],
301                ["\N{U+00F3}", '&oacute;'],
302                ["\N{U+015B}", '&sacute;'],
303                ["\N{U+00FB}", '&ucirc;'],
304                ["\N{U+0160}", '&Scaron;'],
305                ["\N{U+2013}", '&ndash;'],
306                ["\N{U+2014}", '&mdash;'],
307                ["\N{U+2018}", '&quot;'],
308                ["\N{U+2019}", '&quot;'],
309                ["\N{U+201C}", '&quot;'],
310                ["\N{U+201D}", '&quot;'],
311                ["\N{U+2026}", '...'],        # …
312                ["\N{U+25CF}", '&#x25CF;'],       # ●
313        );
314
315
6
7
        $string = _sub_map(\$string, \@byte_map);
316
317        # utf8::encode($string);
318        # $string =~ s/Å¡/&scaron;/g;
319        # $string =~ s/č/&ccaron;/g;
320        # $string =~ s/ž/&zcaron;/g;
321        # $string =~ s/é/&eacute;/g;
322        # $string =~ s/ç/&ccedil;/g;
323        # $string =~ s/\N{U+0161}/&scaron;/g;
324        # $string =~ s/\N{U+010D}/&ccaron;/g;
325        # $string =~ s/\N{U+017E}/&zcaron;/g;
326        # $string =~ s/\N{U+00E9}/&eacute;/g;
327        # $string =~ s/\N{U+00D6}/&Ouml;/g; # Ö
328        # $string =~ s/\N{U+00E7}/&ccedil;/g;       # ç
329        # $string =~ s/\N{U+00E8}/&egrave;/g;
330        # $string =~ s/\N{U+00E9}/&Eacute;/g;
331
332        # print STDERR __LINE__, ": ($string)";
333        # print STDERR (sprintf '%v02X', $string);
334        # print STDERR "\n";
335
336        # utf8::decode($string);
337
338        # print STDERR __LINE__, ": ($string)";
339        # # print STDERR (sprintf '%v02X', $string);
340        # print STDERR "\n";
341
342        # $string =~ s/\xe4\x8d/&ccaron;/g; # ? ACOM strangeness
343        # $string =~ s/\N{U+0161}/&scaron;/g;
344        # $string =~ s/\N{U+010D}/&ccaron;/g;
345        # $string =~ s/\N{U+00A9}/&copy;/g;
346        # $string =~ s/\N{U+00AE}/&reg;/g;
347        # $string =~ s/\N{U+00E2}/&acirc;/g;
348        # $string =~ s/\N{U+00E4}/&auml;/g;
349        # $string =~ s/\N{U+00E8}/&egrave;/g;
350        # $string =~ s/\N{U+00E9}/&eacute;/g;
351        # $string =~ s/\N{U+00EB}/&euml;/g;
352        # $string =~ s/\N{U+00F3}/&oacute;/g;
353        # $string =~ s/\N{U+00FB}/&ucirc;/g;
354        # $string =~ s/\N{U+017E}/&zcaron;/g;
355        # $string =~ s/\N{U+00D6}/&Ouml;/g; # Ö
356        # $string =~ s/\N{U+00E7}/&ccedil;/g;       # ç
357        # $string =~ s/\N{U+00C9}/&Eacute;/g;
358        # $string =~ s/\N{U+00CA}/&ecirc;/g;
359        # $string =~ s/\N{U+0160}/&Scaron;/g;       # FIXME: also above
360        # $string =~ s/\N{U+2013}/-/g;
361
362
6
82
        @byte_map = (
363                [ 'Á', '&Aacute;' ],
364                [ 'Ã¥', '&aring;' ],
365                [ 'ª', '&ordf;' ],
366                [ 'Å¡', '&scaron;' ],
367                [ 'Å ', '&Scaron;' ],
368                [ 'č', '&ccaron;' ],
369                [ 'ž', '&zcaron;' ],
370                [ 'á', '&aacute;' ],
371                [ 'â', '&acirc;' ],
372                [ 'é', '&eacute;' ],
373                [ 'è', '&egrave;' ],
374                [ 'ç', '&ccedil;' ],
375                [ 'ê', '&ecirc;' ],
376                [ 'ë', '&euml;' ],
377                [ 'ð', '&eth;' ],
378                [ 'í', '&iacute;' ],
379                [ 'ï', '&iuml;' ],
380                [ 'ÃŽ', '&Iicrc;' ],
381                [ '©', '&copy;' ],
382                [ '®', '&reg;' ],
383                [ 'ó', '&oacute;' ],
384                [ 'ô', '&ocirc;' ],
385                [ 'ö', '&ouml;' ],
386                [ 'ø', '&oslash;' ],
387                [ 'Å›', '&sacute;' ],
388                [ 'Þ', '&THORN;' ],
389                [ 'þ', '&thorn;' ],
390                [ 'û', '&ucirc;' ],
391                [ 'ü', '&uuml;' ],
392                [ 'ú', '&uacute;' ],
393                [ 'µ', '&micro;'],
394                [ '£', '&pound;' ],
395                [ 'ß', '&szlig;' ],
396                [ '–', '&ndash;' ],
397                [ '—', '&mdash;' ],
398                [ 'ñ', '&ntilde;' ],
399                [ '“', '&quot;' ],
400                [ '”', '&quot;' ],
401                [ '«', '&quot;' ],
402                [ '»', '&quot;' ],
403                [ '…', '...' ],
404                [ '●', '&#x25CF;' ],
405                [ "\x80\$", ' ' ],
406        );
407
408
6
7
        $string = _sub_map(\$string, \@byte_map);
409
410        # if($string =~ /^Maria\(/) {
411                # # print STDERR (unpack 'H*', $string);
412                # print STDERR __LINE__, ': ';
413                # print STDERR (sprintf '%v02X', $string);
414                # print STDERR "\n";
415                # my $i = 0;
416                # while((my @call_details = caller($i++))) {
417                        # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
418                # }
419                # # die $string;
420        # }
421
422        # print STDERR __LINE__, ": ($string)\n";
423
6
11
        if($string =~ /[^[:ascii:]]/) {
424
1
2
                $string = HTML::Entities::encode_entities_numeric($string, '\x80-\x{10FFFF}');
425
1
39
                if($string =~ /[^[:ascii:]]/) {
426
0
0
                        print STDERR (unpack 'H*', $string);
427
0
0
                        print STDERR __LINE__, ': ';
428
0
0
                        print STDERR (sprintf '%v02X', $string), "\n";
429
0
0
                        my $i = 0;
430
0
0
                        while((my @call_details = caller($i++))) {
431
0
0
                                print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
432                        }
433
0
0
                        $complain->("TODO: wide_to_html($string)") if($complain);
434
0
0
                        warn "TODO: wide_to_html($string)";
435                        # $string =~ s/[^[:ascii:]]/XXXXX/g;
436
0
0
                        $string =~ s{
437                                        ([^[:ascii:]])
438                                }{
439
0
0
                                        '>>>>' . sprintf("%04X", ord($1)) . '<<<<'
440                                }gex;   # e=evaluate, g=global, x=extended
441
0
0
                        die "BUG: wide_to_html($string)";
442                }
443        }
444
445
6
26
        return $string;
446}
447
448# See https://www.compart.com/en/unicode/U+0161 etc.
449#       https://www.compart.com/en/unicode/U+00EB
450sub wide_to_xml
451{
452
25
10643
        my $params = Params::Get::get_params('string', @_);
453
454
24
324
        my $string = $params->{'string'};
455
24
22
        my $complain = $params->{'complain'};
456
457
24
32
        if(!defined($string)) {
458
0
0
                my $i = 0;
459
0
0
                while((my @call_details = caller($i++))) {
460
0
0
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
461                }
462
0
0
                die 'BUG: string not set';
463        }
464
465
24
33
        if(ref($string) eq 'SCALAR') {
466
9
9
7
11
                $string = ${$string};
467        }
468
469        # print STDERR __LINE__, ": ($string)";
470        # print STDERR (sprintf '%v02X', $string);
471        # print STDERR "\n";
472
473        # my $i = 0;
474        # while((my @call_details = caller($i++))) {
475                # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
476        # }
477
478
24
62
        $string = HTML::Entities::decode($string);
479        # print STDERR __LINE__, ": ($string)\n";
480
481        # $string =~ s/&amp;/&/g;
482
483        # I don't think HTML::Entities does these
484
24
39
        my %entity_map = (
485                '&ccaron;' => 'č',
486                '&zcaron;' => 'ž',
487                '&Scaron;' => 'Å ',
488        );
489
490
24
64
        $string =~ s{
491                # ([\x80-\x{10FFFF}])
492                (.)
493        }{
494
514
434
                my $cp = $1;
495                exists $entity_map{$cp}
496
514
748
                        ? $entity_map{$cp}
497                        : $cp
498        }gex;
499
500        # Escape only if it's not already part of an entity
501
24
52
        $string =~ s/&(?![A-Za-z#0-9]+;)/&amp;/g;
502
503
24
34
        unless($params->{'keep_hrefs'}) {
504
22
57
                %entity_map = (
505                        '<' => '&lt;',
506                        '>' => '&gt;',
507                        '"' => '&quot;',
508                        '“' => '&quot;',       # U+201C
509                        '”' => '&quot;',       # U+201D
510                );
511
512
22
30
                $string =~ s{(.)}{
513
458
322
                        my $cp = $1;
514                        exists $entity_map{$cp}
515
458
516
                                ? $entity_map{$cp}
516                                : $cp
517                }gex;
518        }
519
520        # $string =~ s/‘/&apos;/g;
521        # $string =~ s/’/&apos;/g;
522        # $string =~ s/‘/&apos;/g;
523        # $string =~ s/‘/&apos;/g;
524        # $string =~ s/\x98/&apos;/g;
525        # $string =~ s/['‘’‘\x98]/&apos;/g;
526
527        # Table of byte-sequences->entities
528
24
93
        my @byte_map = (
529                [ "\xe2\x80\x9c", '&quot;' ],     # “
530                [ "\xe2\x80\x9d", '&quot;' ],     # ”
531                [ '“', '&quot;' ],        # U+201C
532                [ '”', '&quot;' ],        # U+201D
533                [ "\xe2\x80\x93", '-' ],      # ndash
534                [ "\xe2\x80\x94", '-' ],      # mdash
535                [ "\xe2\x80\x98", '&apos;' ],     # ‘
536                [ "\xe2\x80\x99", '&apos;' ],     # ’
537                [ "\xe2\x80\xA6", '...' ],    # …
538                [ "'", '&apos;' ],
539                [ '‘', '&apos;' ],
540                [ '’', '&apos;' ],
541                [ '‘', '&apos;' ],
542                [ "\x98", '&apos;' ],
543        );
544
545
24
31
        $string = _sub_map(\$string, \@byte_map);
546
547
24
183
        %entity_map = (
548                '&copy;' => '&#x0A9;',
549                '&Aacute;' => '&#x0C1;',     # Á
550                '&ccaron;' => '&#x10D;',
551                '&agrave;' => '&#x0E0;',     # á
552                '&aacute;' => '&#x0E1;',     # á
553                '&acirc;' => '&#x0E2;',              # â
554                '&auml;' => '&#x0E4;',               # ä
555                '&aring;' => '&#x0E5;',      # Ã¥
556                '&ccedil;' => '&#x0E7;',     # ç
557                '&egrave;' => '&#x0E8;',
558                '&eacute;' => '&#x0E9;',
559                '&ecirc;' => '&#x0EA;',
560                '&euml;' => '&#x0EB;',       # euml
561                '&Icirc;' => '&#x0CE;',      # ÃŽ
562                '&Eacute;' => '&#x0C9;',
563                '&szlig;' => '&#x0DF;',      # ß
564                '&iacute;' => '&#xED;',      # í
565                '&icirc;' => '&#x0EE;',
566                '&iuml;' => '&#x0EF;',       # ï
567                '&eth;' => '&#x0F0;',        # ð
568                '&uacute;' => '&#0FA;',      # ú
569                '&uuml;' => '&#x0FC;',
570                '&scaron;' => '&#x161;',
571                '&oacute;' => '&#x0F3;',     # ó
572                '&ucirc;' => '&#x0F4;',
573                '&ouml;' => '&#x0F6;',
574                '&ordf;' => '&#x0AA;',       # ª
575                '&oslash;' => '&#x0F8;',     # ø
576                '&zcaron;' => '&#x17E;',
577                '&Scaron;' => '&#x160;',
578                '&THORN;' => '&#x0DE;',      # Þ
579                '&thorn;' => '&#x0FE;',      # þ
580                '&reg;' => '&#x0AE;',
581                '&pound;' => '&#163;',
582                '&ntilde;' => '&#x0F1;',
583                '&mdash;' => '-',
584                '&ndash;' => '-',
585                '&excl;' => '!',
586        );
587
588
24
42
        $string =~ s{(.)}{
589
611
340
                my $cp = $1;
590                exists $entity_map{$cp}
591
611
540
                        ? $entity_map{$cp}
592                        : $cp
593        }gex;
594
595
24
45
        if($string !~ /[^[:ascii:]]/) {
596
18
89
                return $string;
597        }
598
599        # print STDERR __LINE__, ": ($string)";
600        # print STDERR (sprintf '%v02X', $string);
601        # print STDERR "\n";
602
603        @byte_map = (
604
6
80
                ["\xc2\xa0", ' '],    # Non breaking space
605                ["\xc2\xa3", '&#x0A3;'],  # £
606                ["\xc2\xa9", '&#x0A9;'],
607                ["\xc2\xaa", '&#x0AA;'],  # ª
608                ["\xc2\xab", '&quot;'],   # «
609                ["\xc2\xae", '&#x0AE;'],
610                ["\xc3\x81", '&#x0C1;'],  # Á
611                ["\xc3\x8e", '&#x0CE;'],  # ÃŽ
612                ["\xc3\xa0", '&#x0E0;'],  # à
613                ["\xc3\xa1", '&#x0E1;'],  # á
614                ["\xc3\xa5", '&#x0E5;'],  # Ã¥
615                ["\xc3\xa9", '&#x0E9;'],
616                ["\xc3\xaf", '&#x0EF;'],  # ï
617                ["\xc3\xb1", '&#x0F1;'],  # ntilde ñ
618                ["\xc5\xa1", '&#x161;'],
619                ["\xc4\x8d", '&#x10D;'],
620                ["\xc5\xbe", '&#x17E;'],  # ž
621                ["\xc3\x96", '&#x0D6;'],  # Ö
622                ["\xc3\x9e", '&#x0DE;'],  # Þ
623                ["\xc3\x9f", '&#x0DF;'],  # ß
624                ["\xc3\xa2", '&#x0E2;'],  # â
625                ["\xc3\xad", '&#x0ED;'],  # í
626                ["\xc3\xa4", '&#x0E4;'],  # ä
627                ["\xc3\xa7", '&#x0E7;'],  # ç
628                ["\xc3\xb0", '&#x0F0;'],  # ð
629                ["\xc3\xb3", '&#x0F3;'],  # ó
630                ["\xc3\xb8", '&#x0F8;'],  # ø
631                ["\xc3\xbc", '&#x0FC;'],  # ü
632                ["\xc3\xbe", '&#x0FE;'],  # þ
633                ["\xc3\xa8", '&#x0E8;'],  # è
634                ["\xc3\xee", '&#x0EE;'],
635                ["\xc3\xb4", '&#x0F4;'],  # ô
636                ["\xc3\xb6", '&#x0F6;'],  # ö
637                ["\xc3\x89", '&#x0C9;'],
638                ["\xc3\xaa", '&#x0EA;'],
639                ["\xc3\xab", '&#x0EB;'],  # eumlaut
640                ["\xc3\xba", '&#x0FA;'],  # ú
641                ["\xc3\xbb", '&#x0BB;'],  # û - ucirc
642                ["\xc5\x9b", '&#x15B;'],  # Å› - sacute
643                ["\xc5\xa0", '&#x160;'],
644                ["\xe2\x80\x93", '-'],
645                ["\xe2\x80\x94", '-'],
646                ["\xe2\x80\x9c", '&quot;'],
647                ["\xe2\x80\x9d", '&quot;'],
648                ["\xe2\x80\xa6", '...'],
649                ["\xe2\x97\x8f", '&#x25CF;'],     # ●
650                ["\xe3\xb1", '&#x0F1;'],  # ntilde ñ - what's this one?
651
652        # $string =~ s/\xe4\x8d/&#x10D;/g;  # ? ACOM strangeness
653        # $string =~ s/\N{U+0161}/&#x161;/g;
654        # $string =~ s/\N{U+010D}/&#x10D;/g;
655        # $string =~ s/\N{U+00E9}/&#x0E9;/g;
656        # $string =~ s/\N{U+017E}/&#x17E;/g;
657
658                ["\N{U+00A0}", ' '],
659                ["\N{U+010D}", '&#x10D;'],
660                ["\N{U+00AB}", '&quot;'], # «
661                ["\N{U+00AE}", '&#x0AE;'],        # ®
662                ["\N{U+00B5}", '&#x0B5;'],        # µ
663                ["\N{U+00C1}", '&#x0C1;'],        # Á
664                ["\N{U+00CE}", '&#x0CE;'],        # ÃŽ
665                ["\N{U+00DE}", '&#x0DE;'],        # Þ
666                ["\N{U+00E4}", '&#x0E4;'],        # ä
667                ["\N{U+00E5}", '&#x0E5;'],        # Ã¥
668                ["\N{U+00EA}", '&#x0EA;'],
669                ["\N{U+00ED}", '&#x0ED;'],
670                ["\N{U+00EE}", '&#x0EE;'],
671                ["\N{U+00FE}", '&#x0FE;'],        # þ
672                ["\N{U+00C9}", '&#x0C9;'],
673                ["\N{U+017E}", '&#x17E;'],        # ž
674                ["\N{U+00D6}", '&#x0D6;'],        # Ö
675                ["\N{U+00DF}", '&#x0DF;'],        # ß
676                ["\N{U+00E1}", '&#x0E1;'],        # á - aacute
677                ["\N{U+00E2}", '&#x0E2;'],
678                ["\N{U+00E8}", '&#x0E8;'],        # è
679                ["\N{U+00EF}", '&#x0EF;'],        # ï
680                ["\N{U+00F0}", '&#x0F0;'],        # ð
681                ["\N{U+00F1}", '&#x0F1;'],        # ñ
682                ["\N{U+00F3}", '&#x0F3;'],        # ó
683                ["\N{U+00F4}", '&#x0F4;'],        # ô
684                ["\N{U+00F6}", '&#x0F6;'],        # ö
685                ["\N{U+00F8}", '&#x0F8;'],        # ø
686                ["\N{U+00FA}", '&#x0FA;'],        # ú
687                ["\N{U+00FC}", '&#x0FC;'],        # ü
688                ["\N{U+015B}", '&#x15B;'],        # Å›
689        # print STDERR __LINE__, ": ($string)";
690        # print STDERR (sprintf '%v02X', $string);
691        # print STDERR "\n";
692                ["\N{U+00E9}", '&#x0E9;'],
693        # print STDERR __LINE__, ": ($string)";
694        # print STDERR (sprintf '%v02X', $string);
695        # print STDERR "\n";
696                ["\N{U+00E7}", '&#x0E7;'],        # ç
697                ["\N{U+00EB}", '&#x0EB;'],        # ë
698                ["\N{U+00FB}", '&#x0FB;'],        # û
699                ["\N{U+0160}", '&#x160;'],
700                ["\N{U+0161}", '&#x161;'],
701                ["\N{U+00A9}", '&#x0A9;'],        # ©
702        # print STDERR __LINE__, ": ($string)";
703        # print STDERR (sprintf '%v02X', $string);
704        # print STDERR "\n";
705                ["\N{U+2013}", '-'],
706                ["\N{U+2014}", '-'],
707                ["\N{U+2018}", '&quot;'],
708                ["\N{U+2019}", '&quot;'],
709                ["\N{U+201C}", '&quot;'],
710                ["\N{U+201D}", '&quot;'],
711                ["\N{U+2026}", '...'],        # …
712                ["\N{U+25CF}", '&#x25CF;'],       # ●
713        );
714
715
6
6
        $string = _sub_map(\$string, \@byte_map);
716
717        # utf8::encode($string);
718        # $string =~ s/Å¡/&s#x161;/g;
719        # $string =~ s/č/&#x10D;/g;
720        # $string =~ s/ž/&z#x17E;/g;
721        # $string =~ s/é/&#x0E9;/g;
722        # $string =~ s/Ö/&#x0D6;/g;
723        # $string =~ s/ç/&#x0E7;/g;
724        # $string =~ s/\N{U+0161}/&#x161;/g;
725        # $string =~ s/\N{U+010D}/&#x10D;/g;
726        # $string =~ s/\N{U+017E}/&#x17E;/g;
727        # $string =~ s/\N{U+00E9}/&#x0E9;/g;
728        # $string =~ s/\N{U+00D6}/&#x0D6;/g;        # Ö
729        # $string =~ s/\N{U+00E7}/&#x0E7;/g;        # ç
730
731        # print STDERR __LINE__, ": ($string)";
732        # print STDERR (sprintf '%v02X', $string);
733        # print STDERR "\n";
734
735        # utf8::decode($string);
736
737        # $string =~ s/['\x98]/&#039;/g;
738
6
68
        @byte_map = (
739                ["'", '&#039;'],
740                ["\x98", '&#039;'],
741                ['©', '&#x0A9;'],
742                ['ª', '&#x0AA;'],
743                ['®', '&#x0AE;'],
744                ['Ã¥', '&#x0E5;'],
745                ['Å¡', '&#x161;'],
746                ['č', '&#x10D;'],
747                ['ž', '&#x17E;'],
748                ['£', '&#x0A3;'],
749                ['µ', '&#x0B5;'],
750                ['á', '&#x0E1;'],     # á
751                ['â', '&#x0E2;'],
752                ['ä', '&#x0E4;'],    # ä
753                ['Á', '&#x0C1;'],      # Á
754                ['Ö', '&#x0D6;'],
755                ['ß', '&#x0DF;'],
756                ['ç', '&#x0E7;'],
757                ['è', '&#x0E8;'],
758                ['é', '&#x0E9;'],
759                ['ê', '&#x0EA;'],
760                ['ë', '&#x0EB;'],
761                ['í', '&#x0ED;'],
762                ['ï', '&#x0EF;'],
763                ['ÃŽ', '&#x0CE;'],      # ÃŽ
764                ['Þ', '&#x0DE;'],      # Þ
765                ['ð', '&#x0F0;'],       # ð
766                ['ø', '&#x0F8;'],     # ø
767                ['û', '&#x0FB;'],
768                ['ñ', '&#x0F1;'],
769                ['ú', '&#x0FA;'],
770                ['ü', '&#x0FC;'],
771                ['þ', '&#x0FE;'],    # þ
772                ['“', '&quot;'],
773                ['”', '&quot;'],
774                ['«', '&quot;'],
775                ['»', '&quot;'],
776                ['—', '-'],
777                ['–', '-'],
778                ['…', '...'],
779                ['●', '&#x25CF;'],
780                ["\x80\$", ' '],
781        );
782
783
6
7
        $string = _sub_map(\$string, \@byte_map);
784
785        # if($string =~ /^Maria\(/) {
786                # print STDERR (unpack 'H*', $string);
787                # print STDERR __LINE__, ': ';
788                # print STDERR (sprintf '%v02X', $string);
789                # print STDERR "\n";
790                # my $i = 0;
791                # while((my @call_details = caller($i++))) {
792                        # print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
793                # }
794                # die $string;
795        # }
796
797        # print STDERR __LINE__, ": ($string)\n";
798
6
11
        if($string =~ /[^[:ascii:]]/) {
799
1
10
                print STDERR (unpack 'H*', $string);
800
1
3
                print STDERR __LINE__, ': ';
801
1
3
                print STDERR (sprintf '%v02X', $string);
802
1
2
                print STDERR "\n";
803
1
1
                my $i = 0;
804
1
4
                while((my @call_details = caller($i++))) {
805
2
48
                        print STDERR "\t", colored($call_details[2] . ' of ' . $call_details[1], 'red'), "\n";
806                }
807
1
30
                $complain->("TODO: wide_to_xml($string)") if($complain);
808
1
6
                warn "TODO: wide_to_xml($string)";
809                # $string =~ s/[^[:ascii:]]/XXXXX/g;
810
1
231
                $string =~ s{
811                                ([^[:ascii:]])
812                        }{
813
2
3
                                '>>>>' . sprintf("%04X", ord($1)) . '<<<<'
814                        }gex;   # e=evaluate, g=global, x=extended
815
1
4
                die "BUG: wide_to_xml($string)";
816        }
817
5
33
        return $string;
818}
819
820sub _sub_map
821{
822
72
72
42
56
        my $string = ${$_[0]};
823
72
53
        my $byte_map = $_[1];
824
825        # Build an alternation sorted by longest sequence first
826        my $pattern = join '|',
827
2226
1457
                map { quotemeta($_->[0]) }
828
3738
2000
                sort { length $b->[0] <=> length $a->[0] }
829
72
72
41
107
                @{$byte_map};
830
831
72
78
2557
40
        $string =~ s/($pattern)/do {
832
78
56
                my $bytes = $1;
833
78
3762
78
31
2072
56
                my ($pair) = grep { $_->[0] eq $bytes } @{$byte_map};
834
78
105
                $pair->[1];
835        }/ge;
836
837
72
157
        return $string;
838}
839
840 - 890
=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/Encode-Wide/coverage/>

=item * L<HTML::Entities>

=item * L<Encode>

=item * L<XML::Entities>

=item * L<Unicode::Escape>

=item * L<https://www.compart.com/en/unicode/>

=back

=head1 SUPPORT

This module is provided as-is without any warranty.

Please report any bugs or feature requests to C<bug-encode-wide at rt.cpan.org>,
or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Encode-Wide>.
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
891
8921;