Files
Froxlor/lib/classes/idna/ext/IdnaConvert.php
Michael Kaufmann (d00p) b22e70804b update new IdnaConvert class with needed fixes in them
Signed-off-by: Michael Kaufmann (d00p) <d00p@froxlor.org>
2016-06-19 20:15:33 +02:00

406 lines
15 KiB
PHP

<?php
// {{{ license
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
//
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU Lesser General Public License as |
// | published by the Free Software Foundation; either version 2.1 of the |
// | License, or (at your option) any later version. |
// | |
// | This library is distributed in the hope that it will be useful, but |
// | WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
// | Lesser General Public License for more details. |
// | |
// | You should have received a copy of the GNU Lesser General Public |
// | License along with this library; if not, write to the Free Software |
// | Foundation, Inc., 51 Franklin St, Boston, MA 02110, United States |
// +----------------------------------------------------------------------+
//
// }}}
/**
* Encode/decode Internationalized Domain Names.
*
* The class allows to convert internationalized domain names
* (see RFC 3490 for details) as they can be used with various registries worldwide
* to be translated between their original (localized) form and their encoded form
* as it will be used in the DNS (Domain Name System).
*
* The class provides two public methods, encode() and decode(), which do exactly
* what you would expect them to do. You are allowed to use complete domain names,
* simple strings and complete email addresses as well. That means, that you might
* use any of the following notations:
*
* - www.nörgler.com
* - xn--nrgler-wxa
* - xn--brse-5qa.xn--knrz-1ra.info
*
* Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4 array.
* Unicode output is available in the same formats.
* You can select your preferred format via {@link set_paramter()}.
*
* ACE input and output is always expected to be ASCII.
*
* @author Matthias Sommerfeld <mso@phlylabs.de>
* @copyright 2004-2016 phlyLabs Berlin, http://phlylabs.de
* @version 1.0.1-dev 2016-01-12
*/
namespace Mso\IdnaConvert;
class IdnaConvert {
const Version = '1.1.0';
const SubVersion = 'main';
// Internal settings, do not touch!
protected $encoding = 'utf8'; // Default input charset is UTF-8
protected $strictMode = false; // Behave strict or not
protected $idnVersion = '2008'; // Can be either 2003 (old) or 2008 (default)
protected $NamePrepData = null;
protected $UnicodeTranscoder = null;
/**
* the constructor
*
* @param array|null $params Parameters to control the class' behaviour
* @since 0.5.2
*/
public function __construct($params = null)
{
$this->UnicodeTranscoder = new UnicodeTranscoder();
// Kept for backwarsds compatibility. Consider using the setter methods instead.
if (!empty($params) && is_array($params)) {
if (isset($params['encoding'])) {
$this->setEncoding($params['encoding']);
}
if (isset($params['idn_version'])) {
$this->setIdnVersion($params['idn_version']);
}
if (isset($params['strict_mode'])) {
$this->setStrictMode($params['strict_mode']);
}
}
$this->setIdnVersion($this->idnVersion);
}
public function getClassVersion()
{
return self::Version.'-'.self::SubVersion;
}
/**
* @return string
*/
public function getEncoding()
{
return $this->encoding;
}
/**
* @param string $encoding
*/
public function setEncoding($encoding)
{
switch ($encoding) {
case 'utf8':
case 'ucs4_string':
case 'ucs4_array':
$this->encoding = $encoding;
break;
default:
throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $encoding));
}
}
/**
* @return boolean
*/
public function isStrictMode()
{
return $this->strictMode;
}
/**
* @param boolean $strictMode
*/
public function setStrictMode($strictMode)
{
$this->strictMode = ($strictMode) ? true : false;
}
/**
* @return int
*/
public function getIdnVersion()
{
return $this->idnVersion;
}
/**
* @param int $idnVersion
*/
public function setIdnVersion($idnVersion)
{
if (in_array($idnVersion, ['2003', '2008'])) {
if (is_null($this->NamePrepData) || $idnVersion != $this->idnVersion) {
$this->NamePrepData = null; // Ought to destroy the object's reference
// Re-instantiate with different data set
$this->NamePrepData = ($idnVersion == 2003)
? new NamePrepData2003()
: new NamePrepData();
}
$this->idnVersion = $idnVersion;
} else {
throw new \InvalidArgumentException(sprintf('Invalid IDN version %d', $idnVersion));
}
}
/**
* Decode a given ACE domain name
* @param string $input Domain name (ACE string)
* [@param string $one_time_encoding Desired output encoding]
* @return string Decoded Domain name (UTF-8 or UCS-4)
*/
public function decode($input, $one_time_encoding = null)
{
$punyCode = $this->punycodeFactory();
// Optionally set
if ($one_time_encoding) {
switch ($one_time_encoding) {
case 'utf8':
case 'ucs4_string':
case 'ucs4_array':
break;
default:
throw new \InvalidArgumentException(sprintf('Invalid encoding %s', $one_time_encoding));
}
}
// Make sure to drop any newline characters around
$input = trim($input);
// Negotiate input and try to determine, whether it is a plain string,
// an email address or something like a complete URL
if (strpos($input, '@')) { // Maybe it is an email address
// No no in strict mode
if ($this->strictMode) {
throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
}
list ($email_pref, $input) = explode('@', $input, 2);
$arr = explode('.', $input);
foreach ($arr as $k => $v) {
$conv = $punyCode->decode($v);
if ($conv) {
$arr[$k] = $conv;
}
}
$input = join('.', $arr);
$arr = explode('.', $email_pref);
foreach ($arr as $k => $v) {
$conv = $punyCode->decode($v);
if ($conv) {
$arr[$k] = $conv;
}
}
$email_pref = join('.', $arr);
$return = $email_pref . '@' . $input;
} elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
// No no in strict mode
if ($this->strictMode) {
throw new \InvalidArgumentException('Only individual domain name parts can be handled in strict mode');
}
$parsed = parse_url($input);
if (isset($parsed['host'])) {
$arr = explode('.', $parsed['host']);
foreach ($arr as $k => $v) {
$conv = $punyCode->decode($v);
if ($conv) {
$arr[$k] = $conv;
}
}
$parsed['host'] = join('.', $arr);
$return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
(empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
$parsed['host'].
(empty($parsed['port']) ? '' : ':' . $parsed['port']).
(empty($parsed['path']) ? '' : $parsed['path']).
(empty($parsed['query']) ? '' : '?' . $parsed['query']).
(empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
} else { // parse_url seems to have failed, try without it
$arr = explode('.', $input);
foreach ($arr as $k => $v) {
$conv = $punyCode->decode($v);
if ($conv) {
$arr[$k] = $conv;
}
}
$return = join('.', $arr);
}
} else { // Otherwise we consider it being a pure domain name string
$return = $punyCode->decode($input);
if (!$return) {
$return = $input;
}
}
// The output is UTF-8 by default, other output formats need conversion here
// If one time encoding is given, use this, else the objects property
$outputEncoding = ($one_time_encoding) ? $one_time_encoding : $this->encoding;
switch ($outputEncoding) {
case 'utf8':
return $return; // break;
case 'ucs4_string':
return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4'); // break;
case 'ucs4_array':
return $this->UnicodeTranscoder->convert($return, 'utf8', 'ucs4array'); // break;
default:
throw new \InvalidArgumentException(sprintf('Unsupported output encoding %s', $outputEncoding));
}
}
/**
* Encode a given UTF-8 domain name
* @param string $decoded Domain name (UTF-8 or UCS-4)
* [@param boolean $one_time_encoding Desired input encoding, see {@link set_parameter}]
* @return string Encoded Domain name (ACE string)
*/
public function encode($decoded, $one_time_encoding = false)
{
// Forcing conversion of input to UCS4 array
// If one time encoding is given, use this, else the objects property
$inputEncoding = $one_time_encoding ? $one_time_encoding : $this->encoding;
switch ($inputEncoding) {
case 'utf8':
$decoded = $this->UnicodeTranscoder->convert($decoded, 'utf8', 'ucs4array');
break;
case 'ucs4_string':
$decoded = $this->UnicodeTranscoder->convert($decoded, 'ucs4', 'ucs4array');
break;
case 'ucs4_array':
break;
default:
throw new \InvalidArgumentException(sprintf('Unsupported input encoding %s', $inputEncoding));
}
// No input, no output, what else did you expect?
if (empty($decoded)) {
return '';
}
$punyCode = $this->punycodeFactory();
// Anchors for iteration
$last_begin = 0;
// Output string
$output = '';
foreach ($decoded as $k => $v) {
// Make sure to use just the plain dot
switch ($v) {
case 0x3002:
case 0xFF0E:
case 0xFF61:
$decoded[$k] = 0x2E;
// Right, no break here, the above are converted to dots anyway
// Stumbling across an anchoring character
case 0x2E:
case 0x2F:
case 0x3A:
case 0x3F:
case 0x40:
// Neither email addresses nor URLs allowed in strict mode
if ($this->strictMode) {
throw new \InvalidArgumentException('Neither email addresses nor URLs are allowed in strict mode.');
} else {
// Skip first char
if ($k) {
$encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($k) - $last_begin)));
if ($encoded) {
$output .= $encoded;
} else {
$output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($k) - $last_begin)), 'ucs4array', 'utf8');
}
$output .= chr($decoded[$k]);
}
$last_begin = $k + 1;
}
}
}
// Catch the rest of the string
if ($last_begin) {
$inp_len = sizeof($decoded);
$encoded = $punyCode->encode(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
if ($encoded) {
$output .= $encoded;
} else {
$output .= $this->UnicodeTranscoder->convert(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)), 'ucs4array', 'utf8');
}
return $output;
} else {
if (false !== ($output = $punyCode->encode($decoded))) {
return $output;
} else {
return $this->UnicodeTranscoder->convert($decoded, 'ucs4array', 'utf8');
}
}
}
/**
* Mitigates a weakness of encode(), which cannot properly handle URIs but instead encodes their
* path or query components, too.
* @param string $uri Expects the URI as a UTF-8 (or ASCII) string
* @return string The URI encoded to Punycode, everything but the host component is left alone
* @since 0.6.4
*/
public function encodeUri($uri)
{
$parsed = parse_url($uri);
if (!isset($parsed['host'])) {
throw new \InvalidArgumentException('The given string does not look like a URI');
}
$arr = explode('.', $parsed['host']);
foreach ($arr as $k => $v) {
$conv = $this->encode($v, 'utf8');
if ($conv) {
$arr[$k] = $conv;
}
}
$parsed['host'] = join('.', $arr);
$return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://')).
(empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@').
$parsed['host'].
(empty($parsed['port']) ? '' : ':' . $parsed['port']).
(empty($parsed['path']) ? '' : $parsed['path']).
(empty($parsed['query']) ? '' : '?' . $parsed['query']).
(empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
return $return;
}
/**
* The actual punycode class is rather costly, as well as passing the huge nameprep database around.
* This factory method allows to ease the burden when dealing with multiple IDN versions.
*
* @return \Mso\IdnaConvert\Punycode
*/
protected function punycodeFactory()
{
static $instances = [];
if (!isset($instances[$this->idnVersion])) {
$instances[$this->idnVersion] = new Punycode($this->NamePrepData, $this->UnicodeTranscoder);
}
return $instances[$this->idnVersion];
}
}