Есть ли в PHP или JavaScript функция, которая преобразует ВСЕ именованные HTML-сущности в их десятичные эквиваленты?


Вопрос в заголовке. Я использую средства разбора XML из jQuery для обработки XML, который обычно содержит специальные символы HTML в виде именованных сущностей (например, `&nbsp;`); это нарушает работу моего приложения, поскольку jQuery не считает такой документ допустимым XML.

Чтобы избежать этого на данный момент, я просто обрабатываю XML с помощью PHP, прежде чем он будет передан на сторону клиента - вот фрагмент моего кода:

// Manual workaround: htmlentities() emits named HTML entities, which XML does
// not define, so each name is rewritten to its decimal numeric reference.
// One str_replace() call has to be added for every entity that shows up.
$fixedmessage = str_replace('&Acirc;', '&#194;', htmlentities($MessageText[$j], ENT_COMPAT, "UTF-8" ));
$fixedmessage = str_replace('&pound;', '&#163;', $fixedmessage);
$fixedmessage = str_replace('&Atilde;', '&#195;', $fixedmessage);
$fixedmessage = str_replace('&iexcl;', '&#161;', $fixedmessage);
$fixedmessage = str_replace('&aacute;', '&#225;', $fixedmessage);
$fixedmessage = str_replace('&iacute;', '&#237;', $fixedmessage);
...

Поскольку функция PHP htmlentities кажется почти бесполезной для всего, кроме самых базовых случаев, я просто добавляю ручную замену для каждого специального символа, когда он начинает вызывать проблемы. Но это не особенно элегантный и, полагаю, не особенно эффективный способ. Есть ли способ лучше?

Author: Xenthide, 2015-06-05

2 answers

Как насчет того, чтобы сначала декодировать их, а затем перекодировать в режиме XML:

// Decode every HTML entity to its literal UTF-8 character, then re-escape only
// the characters XML itself reserves. Flags and charset are spelled out so the
// result does not depend on per-version defaults or on default_charset.
// ENT_HTML5 is used for decoding because it knows the largest set of names.
// Apply this to text content only: markup passed through it would have its
// own "<" and ">" escaped as well.
htmlentities(
    html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'),
    ENT_QUOTES | ENT_XML1,
    'UTF-8');

И вот простое решение:

/**
 * Rewrites named HTML entities as hexadecimal numeric character references.
 *
 * For example "&eacute;" becomes "&#x00E9;". Names that are not present in the
 * lookup table are left exactly as they appear in the input, and existing
 * numeric references are not touched.
 *
 * @param string $str Text that may contain named HTML entities.
 * @return string|null The rewritten text, or null if the regex engine reports an error.
 */
function decode_named_entities($str) {
    // Entity name => Unicode code point written as four hexadecimal digits.
    static $entities = array( "Aacute"=>"00C1", "aacute"=>"00E1", "Acirc"=>"00C2", "acirc"=>"00E2",
    "acute"=>"00B4", "AElig"=>"00C6", "aelig"=>"00E6", "Agrave"=>"00C0", "agrave"=>"00E0",
    "alefsym"=>"2135", "Alpha"=>"0391", "alpha"=>"03B1", "amp"=>"0026", "and"=>"2227", "ang"=>"2220",
    "apos"=>"0027", "Aring"=>"00C5", "aring"=>"00E5", "asymp"=>"2248", "Atilde"=>"00C3",
    "atilde"=>"00E3", "Auml"=>"00C4", "auml"=>"00E4", "bdquo"=>"201E", "Beta"=>"0392", "beta"=>"03B2",
    "brvbar"=>"00A6", "bull"=>"2022", "cap"=>"2229", "Ccedil"=>"00C7", "ccedil"=>"00E7",
    "cedil"=>"00B8", "cent"=>"00A2", "Chi"=>"03A7", "chi"=>"03C7", "circ"=>"02C6", "clubs"=>"2663",
    "cong"=>"2245", "copy"=>"00A9", "crarr"=>"21B5", "cup"=>"222A", "curren"=>"00A4", "dagger"=>"2020",
    "Dagger"=>"2021", "darr"=>"2193", "dArr"=>"21D3", "deg"=>"00B0", "Delta"=>"0394", "delta"=>"03B4",
    "diams"=>"2666", "divide"=>"00F7", "Eacute"=>"00C9", "eacute"=>"00E9", "Ecirc"=>"00CA",
    "ecirc"=>"00EA", "Egrave"=>"00C8", "egrave"=>"00E8", "empty"=>"2205", "emsp"=>"2003",
    "ensp"=>"2002", "Epsilon"=>"0395", "epsilon"=>"03B5", "equiv"=>"2261", "Eta"=>"0397",
    "eta"=>"03B7", "ETH"=>"00D0", "eth"=>"00F0", "Euml"=>"00CB", "euml"=>"00EB", "euro"=>"20AC",
    "exist"=>"2203", "fnof"=>"0192", "forall"=>"2200", "frac12"=>"00BD", "frac14"=>"00BC",
    "frac34"=>"00BE", "frasl"=>"2044", "Gamma"=>"0393", "gamma"=>"03B3", "ge"=>"2265", "gt"=>"003E",
    "harr"=>"2194", "hArr"=>"21D4", "hearts"=>"2665", "hellip"=>"2026", "Iacute"=>"00CD",
    "iacute"=>"00ED", "Icirc"=>"00CE", "icirc"=>"00EE", "iexcl"=>"00A1", "Igrave"=>"00CC",
    "igrave"=>"00EC", "image"=>"2111", "infin"=>"221E", "int"=>"222B", "Iota"=>"0399", "iota"=>"03B9",
    "iquest"=>"00BF", "isin"=>"2208", "Iuml"=>"00CF", "iuml"=>"00EF", "Kappa"=>"039A", "kappa"=>"03BA",
    "Lambda"=>"039B", "lambda"=>"03BB", "lang"=>"2329", "laquo"=>"00AB", "larr"=>"2190",
    "lArr"=>"21D0", "lceil"=>"2308", "ldquo"=>"201C", "le"=>"2264", "lfloor"=>"230A", "lowast"=>"2217",
    "loz"=>"25CA", "lrm"=>"200E", "lsaquo"=>"2039", "lsquo"=>"2018", "lt"=>"003C", "macr"=>"00AF",
    "mdash"=>"2014", "micro"=>"00B5", "middot"=>"00B7", "minus"=>"2212", "Mu"=>"039C", "mu"=>"03BC",
    "nabla"=>"2207", "nbsp"=>"00A0", "ndash"=>"2013", "ne"=>"2260", "ni"=>"220B", "not"=>"00AC",
    "notin"=>"2209", "nsub"=>"2284", "Ntilde"=>"00D1", "ntilde"=>"00F1", "Nu"=>"039D", "nu"=>"03BD",
    "Oacute"=>"00D3", "oacute"=>"00F3", "Ocirc"=>"00D4", "ocirc"=>"00F4", "OElig"=>"0152",
    "oelig"=>"0153", "Ograve"=>"00D2", "ograve"=>"00F2", "oline"=>"203E", "Omega"=>"03A9",
    "omega"=>"03C9", "Omicron"=>"039F", "omicron"=>"03BF", "oplus"=>"2295", "or"=>"2228",
    "ordf"=>"00AA", "ordm"=>"00BA", "Oslash"=>"00D8", "oslash"=>"00F8", "Otilde"=>"00D5",
    "otilde"=>"00F5", "otimes"=>"2297", "Ouml"=>"00D6", "ouml"=>"00F6", "para"=>"00B6", "part"=>"2202",
    "permil"=>"2030", "perp"=>"22A5", "Phi"=>"03A6", "phi"=>"03C6", "Pi"=>"03A0", "pi"=>"03C0",
    "piv"=>"03D6", "plusmn"=>"00B1", "pound"=>"00A3", "prime"=>"2032", "Prime"=>"2033", "prod"=>"220F",
    "prop"=>"221D", "Psi"=>"03A8", "psi"=>"03C8", "quot"=>"0022", "radic"=>"221A", "rang"=>"232A",
    "raquo"=>"00BB", "rarr"=>"2192", "rArr"=>"21D2", "rceil"=>"2309", "rdquo"=>"201D", "real"=>"211C",
    "reg"=>"00AE", "rfloor"=>"230B", "Rho"=>"03A1", "rho"=>"03C1", "rlm"=>"200F", "rsaquo"=>"203A",
    "rsquo"=>"2019", "sbquo"=>"201A", "Scaron"=>"0160", "scaron"=>"0161", "sdot"=>"22C5",
    "sect"=>"00A7", "shy"=>"00AD", "Sigma"=>"03A3", "sigma"=>"03C3", "sigmaf"=>"03C2", "sim"=>"223C",
    "spades"=>"2660", "sub"=>"2282", "sube"=>"2286", "sum"=>"2211", "sup"=>"2283", "sup1"=>"00B9",
    "sup2"=>"00B2", "sup3"=>"00B3", "supe"=>"2287", "szlig"=>"00DF", "Tau"=>"03A4", "tau"=>"03C4",
    "there4"=>"2234", "Theta"=>"0398", "theta"=>"03B8", "thetasym"=>"03D1", "thinsp"=>"2009",
    "THORN"=>"00DE", "thorn"=>"00FE", "tilde"=>"02DC", "times"=>"00D7", "trade"=>"2122",
    "Uacute"=>"00DA", "uacute"=>"00FA", "uarr"=>"2191", "uArr"=>"21D1", "Ucirc"=>"00DB",
    "ucirc"=>"00FB", "Ugrave"=>"00D9", "ugrave"=>"00F9", "uml"=>"00A8", "upsih"=>"03D2",
    "Upsilon"=>"03A5", "upsilon"=>"03C5", "Uuml"=>"00DC", "uuml"=>"00FC", "weierp"=>"2118",
    "Xi"=>"039E", "xi"=>"03BE", "Yacute"=>"00DD", "yacute"=>"00FD", "yen"=>"00A5", "yuml"=>"00FF",
    "Yuml"=>"0178", "Zeta"=>"0396", "zeta"=>"03B6", "zwj"=>"200D", "zwnj"=>"200C");

    // Several names in the table contain digits after the first letter
    // (frac12, sup2, there4, ...), so the pattern has to accept them;
    // a letters-only pattern would silently skip those entities.
    return preg_replace_callback('~&([A-Za-z][A-Za-z0-9]*);~',
        function($m) use($entities) {
            $e = $m[1];
            // Unknown names are returned unchanged ($m[0] is the full match).
            return isset($entities[$e]) ? "&#x{$entities[$e]};" : $m[0];
        },
        $str
    );
}

Более быстрым способом было бы сгенерировать два массива из приведенного выше:

$search = ["&Aacute;", "&aacute;" etc
$replac = ["&#x00C1;", "&#x00E1;" etc

И применить str_replace.

 1
Author: georg, 2015-06-05 09:47:16
<?php
// Build two parallel lists: the named form of each entity and the decimal
// numeric reference that should take its place.
$entities = getEntities();
$from = [];
$to = [];

foreach ($entities as $name => $codePoint) {
    $from[] = "&{$name};";
    $to[] = "&#{$codePoint};";
}

$str = '&xi; and &tau;';

// str_replace() accepts arrays for search and replace and pairs them by
// position, so a single call handles every entity in the table.
// see http://php.net/manual/en/function.str-replace.php
echo str_replace($from, $to, $str);

/**
 * Returns the lookup table of named HTML entities.
 *
 * Keys are entity names without the surrounding "&" and ";"; values are the
 * corresponding code points as decimal integers.
 *
 * @return array<string, int>
 */
function getEntities() {
    // Table source: http://www.mit.edu/afs.new/sipb/project/php/include/entities.h
    return [
        'AElig' => 198,
        'Aacute' => 193,
        'Acirc' => 194,
        'Agrave' => 192,
        'Alpha' => 913,
        'Aring' => 197,
        'Atilde' => 195,
        'Auml' => 196,
        'Beta' => 914,
        'Ccedil' => 199,
        'Chi' => 935,
        'Dagger' => 8225,
        'Delta' => 916,
        'ETH' => 208,
        'Eacute' => 201,
        'Ecirc' => 202,
        'Egrave' => 200,
        'Epsilon' => 917,
        'Eta' => 919,
        'Euml' => 203,
        'Gamma' => 915,
        'Iacute' => 205,
        'Icirc' => 206,
        'Igrave' => 204,
        'Iota' => 921,
        'Iuml' => 207,
        'Kappa' => 922,
        'Lambda' => 923,
        'Mu' => 924,
        'Ntilde' => 209,
        'Nu' => 925,
        'OElig' => 338,
        'Oacute' => 211,
        'Ocirc' => 212,
        'Ograve' => 210,
        'Omega' => 937,
        'Omicron' => 927,
        'Oslash' => 216,
        'Otilde' => 213,
        'Ouml' => 214,
        'Phi' => 934,
        'Pi' => 928,
        'Prime' => 8243,
        'Psi' => 936,
        'Rho' => 929,
        'Scaron' => 352,
        'Sigma' => 931,
        'THORN' => 222,
        'Tau' => 932,
        'Theta' => 920,
        'Uacute' => 218,
        'Ucirc' => 219,
        'Ugrave' => 217,
        'Upsilon' => 933,
        'Uuml' => 220,
        'Xi' => 926,
        'Yacute' => 221,
        'Yuml' => 376,
        'Zeta' => 918,
        'aacute' => 225,
        'acirc' => 226,
        'acute' => 180,
        'aelig' => 230,
        'agrave' => 224,
        'alefsym' => 8501,
        'alpha' => 945,
        'amp' => 38,
        'and' => 8743,
        'ang' => 8736,
        'aring' => 229,
        'asymp' => 8776,
        'atilde' => 227,
        'auml' => 228,
        'bdquo' => 8222,
        'beta' => 946,
        'brvbar' => 166,
        'bull' => 8226,
        'cap' => 8745,
        'ccedil' => 231,
        'cedil' => 184,
        'cent' => 162,
        'chi' => 967,
        'circ' => 710,
        'clubs' => 9827,
        'cong' => 8773,
        'copy' => 169,
        'crarr' => 8629,
        'cup' => 8746,
        'curren' => 164,
        'dArr' => 8659,
        'dagger' => 8224,
        'darr' => 8595,
        'deg' => 176,
        'delta' => 948,
        'diams' => 9830,
        'divide' => 247,
        'eacute' => 233,
        'ecirc' => 234,
        'egrave' => 232,
        'empty' => 8709,
        'emsp' => 8195,
        'ensp' => 8194,
        'epsilon' => 949,
        'equiv' => 8801,
        'eta' => 951,
        'eth' => 240,
        'euml' => 235,
        'euro' => 8364,
        'exist' => 8707,
        'fnof' => 402,
        'forall' => 8704,
        'frac12' => 189,
        'frac14' => 188,
        'frac34' => 190,
        'frasl' => 8260,
        'gamma' => 947,
        'ge' => 8805,
        'gt' => 62,
        'hArr' => 8660,
        'harr' => 8596,
        'hearts' => 9829,
        'hellip' => 8230,
        'iacute' => 237,
        'icirc' => 238,
        'iexcl' => 161,
        'igrave' => 236,
        'image' => 8465,
        'infin' => 8734,
        'int' => 8747,
        'iota' => 953,
        'iquest' => 191,
        'isin' => 8712,
        'iuml' => 239,
        'kappa' => 954,
        'lArr' => 8656,
        'lambda' => 955,
        'lang' => 9001,
        'laquo' => 171,
        'larr' => 8592,
        'lceil' => 8968,
        'ldquo' => 8220,
        'le' => 8804,
        'lfloor' => 8970,
        'lowast' => 8727,
        'loz' => 9674,
        'lrm' => 8206,
        'lsaquo' => 8249,
        'lsquo' => 8216,
        'lt' => 60,
        'macr' => 175,
        'mdash' => 8212,
        'micro' => 181,
        'middot' => 183,
        'minus' => 8722,
        'mu' => 956,
        'nabla' => 8711,
        'nbsp' => 160,
        'ndash' => 8211,
        'ne' => 8800,
        'ni' => 8715,
        'not' => 172,
        'notin' => 8713,
        'nsub' => 8836,
        'ntilde' => 241,
        'nu' => 957,
        'oacute' => 243,
        'ocirc' => 244,
        'oelig' => 339,
        'ograve' => 242,
        'oline' => 8254,
        'omega' => 969,
        'omicron' => 959,
        'oplus' => 8853,
        'or' => 8744,
        'ordf' => 170,
        'ordm' => 186,
        'oslash' => 248,
        'otilde' => 245,
        'otimes' => 8855,
        'ouml' => 246,
        'para' => 182,
        'part' => 8706,
        'permil' => 8240,
        'perp' => 8869,
        'phi' => 966,
        'pi' => 960,
        'piv' => 982,
        'plusmn' => 177,
        'pound' => 163,
        'prime' => 8242,
        'prod' => 8719,
        'prop' => 8733,
        'psi' => 968,
        'quot' => 34,
        'rArr' => 8658,
        'radic' => 8730,
        'rang' => 9002,
        'raquo' => 187,
        'rarr' => 8594,
        'rceil' => 8969,
        'rdquo' => 8221,
        'real' => 8476,
        'reg' => 174,
        'rfloor' => 8971,
        'rho' => 961,
        'rlm' => 8207,
        'rsaquo' => 8250,
        'rsquo' => 8217,
        'sbquo' => 8218,
        'scaron' => 353,
        'sdot' => 8901,
        'sect' => 167,
        'shy' => 173,
        'sigma' => 963,
        'sigmaf' => 962,
        'sim' => 8764,
        'spades' => 9824,
        'sub' => 8834,
        'sube' => 8838,
        'sum' => 8721,
        'sup' => 8835,
        'sup1' => 185,
        'sup2' => 178,
        'sup3' => 179,
        'supe' => 8839,
        'szlig' => 223,
        'tau' => 964,
        'there4' => 8756,
        'theta' => 952,
        'thetasym' => 977,
        'thinsp' => 8201,
        'thorn' => 254,
        'tilde' => 732,
        'times' => 215,
        'trade' => 8482,
        'uArr' => 8657,
        'uacute' => 250,
        'uarr' => 8593,
        'ucirc' => 251,
        'ugrave' => 249,
        'uml' => 168,
        'upsih' => 978,
        'upsilon' => 965,
        'uuml' => 252,
        'weierp' => 8472,
        'xi' => 958,
        'yacute' => 253,
        'yen' => 165,
        'yuml' => 255,
        'zeta' => 950,
        'zwj' => 8205,
        'zwnj' => 8204,
    ];
}

?>
 1
Author: Анатолий Ивашов, 2015-06-05 09:38:11