<?php
/**
 * @file
 * Transliteration processing functions.
 */

/**
 * Converts UTF-8 encoded text into a US-ASCII transliteration.
 *
 * The decoding loop is modelled on Mediawiki's
 * UtfNormal::quickIsNFCVerify(). Each decoded code point is handed to
 * _transliteration_replace(); bytes that do not form a complete multi-byte
 * sequence are replaced by $unknown.
 *
 * @param $string
 *   The UTF-8 encoded input text.
 * @param $unknown
 *   String substituted for characters that have no suitable ASCII
 *   equivalent.
 * @param $source_langcode
 *   Optional ISO 639 language code of the input, used to apply
 *   language-specific variations. When the source language is not known, pass
 *   the site default language to get consistent results; otherwise the
 *   current display language is used.
 *
 * @return
 *   The transliterated text.
 */
function _transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
  // Input without any high-bit byte is already ASCII: return it untouched.
  if (!preg_match('/[\x80-\xff]/', $string)) {
    return $string;
  }

  // Lookup table, keyed by raw byte, giving the number of tail bytes that the
  // byte announces when it appears as the head of a sequence.
  static $tail_bytes;
  if (!isset($tail_bytes)) {
    $tail_bytes = array();
    for ($value = 0; $value < 256; $value++) {
      if ($value < 0xc0 || $value >= 0xfe) {
        $count = 0;
      }
      elseif ($value < 0xe0) {
        $count = 1;
      }
      elseif ($value < 0xf0) {
        $count = 2;
      }
      elseif ($value < 0xf8) {
        $count = 3;
      }
      elseif ($value < 0xfc) {
        $count = 4;
      }
      else {
        $count = 5;
      }
      $tail_bytes[chr($value)] = $count;
    }
  }

  // Marker bits to subtract from a head byte, indexed by its tail byte count.
  $head_offsets = array(1 => 192, 2 => 224, 3 => 240, 4 => 248, 5 => 252);

  // Split the text into runs of plain ASCII and runs that start with a
  // high-bit byte. ASCII runs are copied verbatim; only the others are
  // decoded byte by byte.
  preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

  $result = '';
  foreach ($matches[0] as $segment) {
    if ($segment[0] < "\x80") {
      // Pure ASCII run.
      $result .= $segment;
      continue;
    }

    $head = '';
    $length = strlen($segment);
    $position = 0;

    while ($position < $length) {
      $byte = $segment[$position++];
      $expected = $tail_bytes[$byte];

      if ($expected) {
        // Head of a multi-byte sequence: collect its tail bytes while
        // accumulating the code point, six payload bits per tail byte.
        $head = $byte;
        $ord = ord($byte) - $head_offsets[$expected];
        for ($left = $expected; $left > 0; $left--) {
          if ($position >= $length) {
            // The segment ended in the middle of the sequence.
            $result .= $unknown;
            break 2;
          }
          $tail = $segment[$position];
          if ($tail < "\x80" || $tail >= "\xc0") {
            // Not a continuation byte: give up on this sequence and let the
            // outer loop process the offending byte as a fresh start.
            $result .= $unknown;
            continue 2;
          }
          $ord = $ord * 64 + (ord($tail) - 128);
          $position++;
        }
        $result .= _transliteration_replace($ord, $unknown, $source_langcode);
        $head = '';
      }
      elseif ($byte < "\x80") {
        // ASCII byte embedded in the non-ASCII run.
        $result .= $byte;
        $head = '';
      }
      elseif ($byte < "\xc0") {
        // Continuation byte without a preceding head byte.
        if ($head == '') {
          $result .= $unknown;
        }
      }
      else {
        // 0xfe and 0xff are never valid in UTF-8.
        $result .= $unknown;
        $head = '';
      }
    }
  }
  return $result;
}

/**
 * Replaces a Unicode character using the transliteration database.
 *
 * Language-specific overrides are consulted first, then the generic table
 * for the 256-character bank that contains the code point. Both are loaded
 * from include files under core/includes/transliteration and kept in static
 * caches for the remainder of the request.
 *
 * @param $ord
 *   An ordinal Unicode character code.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $langcode
 *   Optional ISO 639 language code that denotes the language of the input and
 *   is used to apply language-specific variations. Defaults to the current
 *   display language.
 *
 * @return
 *   ASCII replacement character.
 */
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
  static $map = array();
  static $language_overrides = array();

  if (!isset($langcode)) {
    global $language;
    $langcode = $language->langcode;
  }

  // Load the language-specific overrides once per language.
  if (!isset($language_overrides[$langcode])) {
    // Record "no overrides" up front. Without this, a language that has no
    // override file would hit the file system again for every single
    // character that is transliterated.
    $language_overrides[$langcode] = array();
    // Strip everything but letters and hyphens so the language code cannot
    // be used to escape the transliteration directory.
    $file = BACKDROP_ROOT . '/core/includes/transliteration/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';
    if (file_exists($file)) {
      // The included file is expected to populate $overrides[$langcode].
      include $file;
      if (isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
        $language_overrides[$langcode] = $overrides[$langcode];
      }
    }
  }
  if (isset($language_overrides[$langcode][$ord])) {
    return $language_overrides[$langcode][$ord];
  }

  // The generic tables are split into banks of 256 code points each.
  $bank = $ord >> 8;

  if (!isset($map[$bank][$langcode])) {
    $base = array();
    $file = BACKDROP_ROOT . '/core/includes/transliteration/' . sprintf('x%02x', $bank) . '.php';
    if (file_exists($file)) {
      // The included file is expected to populate $base.
      include $file;
      $map[$bank][$langcode] = $base;
    }
    else {
      $map[$bank][$langcode] = array();
    }
  }

  // Index within the bank.
  $ord = $ord & 255;

  return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
}

/**
 * Transliterates and sanitizes a file name.
 *
 * The name is first offered to modules through the
 * 'transliteration_clean_filename_prepare' alter hook, then transliterated
 * with unknown characters dropped. Space characters become underscores and
 * every remaining character other than US-ASCII letters, digits, underscore,
 * period and hyphen is removed. The result is lower-cased when the
 * 'file_transliterate_lowercase' setting in system.core is enabled. An array
 * of names is processed element by element, recursively.
 *
 * @param $filename
 *   A file name, or an array of file names.
 * @param $source_langcode
 *   Optional ISO 639 language code of the input, used to apply
 *   language-specific variations. When the source language is not known, pass
 *   the site default language to get consistent results; otherwise the
 *   current display language is used.
 *
 * @return
 *   Sanitized file name, or array of sanitized file names.
 *
 * @see language_default()
 */
function transliteration_clean_filename($filename, $source_langcode = NULL) {
  // Arrays are cleaned entry by entry, keeping their keys.
  if (is_array($filename)) {
    $cleaned = array();
    foreach ($filename as $key => $name) {
      $cleaned[$key] = transliteration_clean_filename($name, $source_langcode);
    }
    return $cleaned;
  }

  // Let other modules adjust the name before it is transliterated.
  backdrop_alter('transliteration_clean_filename_prepare', $filename, $source_langcode);

  $ascii = transliteration_get($filename, '', $source_langcode);
  $sanitized = preg_replace('/[^a-z0-9_.-]+/i', '', str_replace(' ', '_', $ascii));

  return config_get('system.core', 'file_transliterate_lowercase') ? strtolower($sanitized) : $sanitized;
}

/**
 * Transliterates text.
 *
 * Takes an input string in any language and character set, and tries to
 * represent it in US-ASCII characters by conveying, in Roman letters, the
 * pronunciation expressed by the text in some other writing system.
 *
 * @param string $text
 *   UTF-8 encoded text input.
 * @param string $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param string $source_langcode
 *   Optional ISO 639 language code of the input, used to apply
 *   language-specific variations. When the source language is not known, pass
 *   the site default language to get consistent results. If empty, the
 *   language code of the global $language object is used.
 *
 * @return
 *   Transliterated text.
 *
 * @see language_default()
 */
function transliteration_get($text, $unknown = '?', $source_langcode = NULL) {
  // Fall back to the current display language when none was given.
  $langcode = empty($source_langcode) ? $GLOBALS['language']->langcode : $source_langcode;

  return _transliteration_process($text, $unknown, $langcode);
}

/**
 * Removes diacritics (accents) from accented letters in a UTF-8 string.
 *
 * Only code points strictly between U+00BF and U+017F, or strictly between
 * U+01CC and U+0250, are candidates, minus a fixed list of exclusions in each
 * range. A candidate is replaced only when its transliteration is exactly
 * one byte long; every other character is copied through unchanged.
 *
 * @param string $string
 *   UTF-8 encoded text input.
 *
 * @return string
 *   The input with diacritics removed from the candidate characters. Input
 *   that is not valid UTF-8 is returned unchanged.
 */
function transliteration_remove_diacritics($string) {
  $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
  if ($characters === FALSE) {
    // preg_split() fails on malformed UTF-8. Hand the input back untouched
    // instead of iterating over FALSE and silently returning ''.
    return $string;
  }

  // Code points inside the two candidate ranges that must be left alone.
  $exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);
  $exclusions_range2 = array(0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);

  $result = '';
  foreach ($characters as $character) {
    $code = _transliteration_ord_utf8($character);

    $range1 = $code > 0x00bf && $code < 0x017f;
    $range2 = $code > 0x01cc && $code < 0x0250;

    $replacement = $character;
    if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
      // Look the code point up directly. Passing the integer code to
      // transliteration_get() would treat its decimal digits as text, so no
      // character would ever be replaced. The three-character placeholder
      // makes unknown characters fail the length check below.
      $to_add = _transliteration_replace($code, 'xyz');
      if (strlen($to_add) === 1) {
        $replacement = $to_add;
      }
    }

    $result .= $replacement;
  }

  return $result;
}

/**
 * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
 *
 * Only the first character of the argument is decoded. Sequences of one to
 * four bytes are supported.
 *
 * @param string $character
 *   A single UTF-8 character.
 *
 * @return int
 *   The character code, or -1 if an illegal character is found. Empty input,
 *   an invalid head byte, a truncated sequence and a tail byte outside
 *   0x80-0xBF all count as illegal.
 */
function _transliteration_ord_utf8($character) {
  if (!is_string($character) || $character === '') {
    return -1;
  }

  $first_byte = ord($character[0]);

  if (($first_byte & 0x80) == 0) {
    // Single byte (US-ASCII) character.
    return $first_byte;
  }

  // Work out the number of tail bytes and the payload bits of the head byte.
  if (($first_byte & 0xe0) == 0xc0) {
    $tail_count = 1;
    $code = $first_byte & 0x1f;
  }
  elseif (($first_byte & 0xf0) == 0xe0) {
    $tail_count = 2;
    $code = $first_byte & 0x0f;
  }
  elseif (($first_byte & 0xf8) == 0xf0) {
    $tail_count = 3;
    $code = $first_byte & 0x07;
  }
  else {
    // A stray continuation byte or an unsupported head byte.
    return -1;
  }

  // Guard against truncated sequences before reading the tail bytes.
  if (strlen($character) <= $tail_count) {
    return -1;
  }

  for ($index = 1; $index <= $tail_count; $index++) {
    $byte = ord($character[$index]);
    if (($byte & 0xc0) != 0x80) {
      // Not a continuation byte.
      return -1;
    }
    // Each tail byte contributes six payload bits.
    $code = ($code << 6) | ($byte & 0x3f);
  }

  return $code;
}