1: <?php defined('_JOOS_CORE') or exit();
2:
3: /**
4: * Библиотека работы со строками в правильной кодировке UTF-8
5: *
6: * @version 1.0
7: * @package Core\Libraries
8: * @subpackage String
9: * @category Libraries
10: * @author Joostina Team <info@joostina.ru>
11: * @copyright (C) 2007-2012 Joostina Team
12: * @license MIT License http://www.opensource.org/licenses/mit-license.php
13: * Информация об авторах и лицензиях стороннего кода в составе Joostina CMS: docs/copyrights
14: *
15: * @author Kohana Team
16: * @copyright (c) 2007 Kohana Team
17: * @copyright (c) 2005 Harry Fuecks
18: * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
19: *
20: * */
class joosString
{
    /**
     * Registry of called methods. No method of this class reads or writes it;
     * it is kept because it is part of the public interface.
     *
     * @var array
     */
    public static $called = array();

    /**
     * Tests whether a string contains only 7-bit ASCII bytes.
     *
     * @param string $str string to check
     *
     * @return bool TRUE when no byte above 0x7F is present
     */
    public static function is_ascii($str)
    {
        return !preg_match('/[^\x00-\x7F]/S', $str);
    }

    /**
     * Strips out device control codes in the ASCII range.
     * Tab (0x09), line feed (0x0A) and carriage return (0x0D) are kept.
     *
     * @param string $str string to clean
     *
     * @return string
     */
    public static function strip_ascii_ctrl($str)
    {
        return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
    }

    /**
     * Strips out all non-7-bit ASCII bytes.
     *
     * @param string $str string to clean
     *
     * @return string
     */
    public static function strip_non_ascii($str)
    {
        return preg_replace('/[^\x00-\x7F]+/S', '', $str);
    }

    /**
     * Returns the length of the given string in UTF-8 characters.
     * @see http://php.net/mb_strlen
     *
     * @param string $str string being measured
     *
     * @return integer
     */
    public static function strlen($str)
    {
        return mb_strlen($str, 'utf-8');
    }

    /**
     * Finds the character position of the first occurrence of a UTF-8 string.
     * @see http://php.net/strpos
     *
     * @param string  $str    haystack
     * @param string  $search needle
     * @param integer $offset character offset in haystack to start searching from
     *
     * @return integer|boolean position of needle, FALSE if the needle is not found
     */
    public static function strpos($str, $search, $offset = 0)
    {
        return mb_strpos($str, $search, $offset, 'UTF-8');
    }

    /**
     * Finds the character position of the last occurrence of a UTF-8 string.
     * @see http://php.net/strrpos
     *
     * @param string  $str    haystack
     * @param string  $search needle
     * @param integer $offset character offset in haystack to start searching from
     *
     * @return integer|boolean position of needle, FALSE if the needle is not found
     */
    public static function strrpos($str, $search, $offset = 0)
    {
        return mb_strrpos($str, $search, $offset, 'utf-8');
    }

    /**
     * Returns part of a UTF-8 string.
     * @see http://php.net/substr
     *
     * @param string       $str    input string
     * @param integer      $offset character offset
     * @param integer|null $length maximum number of characters, NULL for "to the end"
     *
     * @return string
     */
    public static function substr($str, $offset, $length = NULL)
    {
        // A NULL length is forwarded as-is: mb_substr() reads it as "until the end".
        return mb_substr($str, $offset, $length, 'UTF-8');
    }

    /**
     * Replaces text within a portion of a UTF-8 string.
     * Offset and length are counted in characters, not bytes.
     * @see http://php.net/substr_replace
     *
     * @param string       $str         input string
     * @param string       $replacement replacement string
     * @param integer      $offset      character offset (negative counts from the end)
     * @param integer|null $length      number of characters to replace, NULL for "to the end"
     *
     * @return string
     */
    public static function substr_replace($str, $replacement, $offset, $length = NULL)
    {
        $has_array_arg = is_array($str) OR is_array($replacement) OR is_array($offset) OR is_array($length);

        // Array arguments, the empty string and malformed UTF-8 (preg_match_all()
        // yields 0 or FALSE) keep the native byte-wise behaviour.
        if ($has_array_arg OR !preg_match_all('/./us', $str, $chars)) {
            return ($length === NULL) ? substr_replace($str, $replacement, $offset) : substr_replace($str, $replacement, $offset, $length);
        }

        $chars = $chars[0];
        // An explicit count is used because array_splice() only accepts NULL as
        // "to the end" since PHP 8.0.
        $length = ($length === NULL) ? count($chars) : (int) $length;

        array_splice($chars, (int) $offset, $length, array((string) $replacement));

        return implode('', $chars);
    }

    /**
     * Makes a UTF-8 string lowercase.
     * @see http://php.net/strtolower
     *
     * @param string $str mixed case string
     *
     * @return string
     */
    public static function strtolower($str)
    {
        return mb_strtolower($str, 'UTF-8');
    }

    /**
     * Makes a UTF-8 string uppercase.
     * @see http://php.net/strtoupper
     *
     * @param string $str mixed case string
     *
     * @return string
     */
    public static function strtoupper($str)
    {
        return mb_strtoupper($str, 'UTF-8');
    }

    /**
     * Makes a UTF-8 string's first character uppercase.
     * @see http://php.net/ucfirst
     *
     * @param string $str mixed case string
     *
     * @return string
     */
    public static function ucfirst($str)
    {
        $first = mb_substr($str, 0, 1, 'UTF-8');
        $rest = mb_substr($str, 1, mb_strlen($str, 'UTF-8'), 'UTF-8');

        return mb_strtoupper($first, 'UTF-8') . $rest;
    }

    /**
     * Converts a UTF-8 string to title case via mb_convert_case(MB_CASE_TITLE).
     * @see http://php.net/ucwords
     *
     * @param string $str mixed case string
     *
     * @return string
     */
    public static function ucwords($str)
    {
        return mb_convert_case($str, MB_CASE_TITLE, 'UTF-8');
    }

    /**
     * Case-insensitive UTF-8 string comparison.
     * Both strings are lowercased and then compared byte-wise with strcmp().
     * @see http://php.net/strcasecmp
     *
     * @param string $str1 string to compare
     * @param string $str2 string to compare
     *
     * @return integer less than 0 if str1 is less than str2, greater than 0
     *                 if str1 is greater than str2, 0 if they are equal
     */
    public static function strcasecmp($str1, $str2)
    {
        return strcmp(mb_strtolower($str1, 'UTF-8'), mb_strtolower($str2, 'UTF-8'));
    }

    /**
     * Replaces all occurrences of search in subject, ignoring case.
     * @see http://php.net/str_ireplace
     *
     * NOTE: unlike the native function, $count is never written by this method.
     * When it holds an integer it acts as a limit: replacing stops once that
     * many replacements have been made for a given search string (so passing
     * 0 disables replacing). With the default NULL every occurrence is replaced.
     *
     * Matching is done on lowercased copies while offsets are applied to the
     * original string, so the method assumes that lowercasing does not change
     * the byte length of the subject.
     *
     * @note It's not fast and gets slower if $search and/or $replace are arrays.
     *
     * @param string|array $search  text to replace
     * @param string|array $replace replacement text
     * @param string       $str     subject text
     * @param integer|null $count   optional replacement limit, see the note above
     *
     * @return string
     */
    public static function str_ireplace($search, $replace, $str, & $count = NULL)
    {
        if (is_array($search)) {
            foreach (array_keys($search) as $k) {
                if (!is_array($replace)) {
                    $replacement = $replace;
                } elseif (array_key_exists($k, $replace)) {
                    $replacement = $replace[$k];
                } else {
                    // No replacement for this key: remove the match.
                    $replacement = '';
                }

                $str = self::str_ireplace($search[$k], $replacement, $str, $count);
            }

            return $str;
        }

        // Unqualified strlen()/substr()/substr_replace() calls below are the
        // native byte-wise functions, which is intended: all offsets in this
        // algorithm are byte offsets.
        $slen = strlen($search);
        if ($slen == 0) {
            return $str;
        }
        $lendif = strlen($replace) - $slen;

        $pattern = '/(.*)' . preg_quote(self::strtolower($search), '/') . '/Us';
        $lstr = self::strtolower($str);
        $i = 0;
        $matched = 0;

        while (preg_match($pattern, $lstr, $matches)) {
            if ($i === $count) {
                break;
            }

            $mlen = strlen($matches[0]);
            // Consume the matched part of the lowercased copy ...
            $lstr = substr($lstr, $mlen);
            // ... and replace the corresponding bytes in the original string.
            $str = substr_replace($str, $replace, $matched + strlen($matches[1]), $slen);
            $matched += $mlen + $lendif;
            $i++;
        }

        return $str;
    }

    /**
     * Case-insensitive UTF-8 version of strstr. Returns all of the input string
     * from the first occurrence of needle to the end.
     * @see http://php.net/stristr
     *
     * @param string $str    input string
     * @param string $search needle
     *
     * @return string|boolean matched substring, FALSE if the needle was not found
     */
    public static function stristr($str, $search)
    {
        if ($search == '') {
            return $str;
        }

        $str_lower = self::strtolower($str);
        $search_lower = self::strtolower($search);

        preg_match('/^(.*?)' . preg_quote($search_lower, '/') . '/s', $str_lower, $matches);

        if (!isset($matches[1])) {
            return FALSE;
        }

        // The byte length of the prefix before the match locates the needle in
        // the original (non-lowercased) string.
        return substr($str, strlen($matches[1]));
    }

    /**
     * Finds the length of the initial segment consisting only of mask characters.
     * @see http://php.net/strspn
     *
     * @param string       $str    input string
     * @param string       $mask   characters allowed in the segment
     * @param integer|null $offset character position to start examining from
     * @param integer|null $length number of characters to examine
     *
     * @return integer number of leading characters that are all in the mask
     */
    public static function strspn($str, $mask, $offset = NULL, $length = NULL)
    {
        if ($str == '' OR $mask == '') {
            return 0;
        }

        if ($offset !== NULL OR $length !== NULL) {
            $str = self::substr($str, (int) $offset, $length);
        }

        // Escape these characters: - [ ] . : \ ^ /
        // The . and : are escaped to prevent possible warnings about POSIX regex elements
        $mask = preg_replace('#[-[\].:\\\\^/]#', '\\\\$0', $mask);
        // Positive character class: the segment is made of mask characters.
        preg_match('/^[' . $mask . ']+/u', $str, $matches);

        return isset($matches[0]) ? self::strlen($matches[0]) : 0;
    }

    /**
     * Finds the length of the initial segment containing no mask characters.
     * @see http://php.net/strcspn
     *
     * @param string       $str    input string
     * @param string       $mask   characters that end the segment
     * @param integer|null $offset character position to start examining from
     * @param integer|null $length number of characters to examine
     *
     * @return integer number of leading characters that are not in the mask
     */
    public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
    {
        if ($str == '' OR $mask == '') {
            return 0;
        }

        if ($offset !== NULL OR $length !== NULL) {
            $str = self::substr($str, (int) $offset, $length);
        }

        // Escape these characters: - [ ] . : \ ^ /
        // The . and : are escaped to prevent possible warnings about POSIX regex elements
        $mask = preg_replace('#[-[\].:\\\\^/]#', '\\\\$0', $mask);
        // Negated character class: the segment stops at the first mask character.
        preg_match('/^[^' . $mask . ']+/u', $str, $matches);

        return isset($matches[0]) ? self::strlen($matches[0]) : 0;
    }

    /**
     * Pads a UTF-8 string to a certain length with another string.
     * @see http://php.net/str_pad
     *
     * @param string  $str              input string
     * @param integer $final_str_length desired length in characters after padding
     * @param string  $pad_str          string to use as padding
     * @param integer $pad_type         STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH
     *
     * @return string the input is returned unchanged when it is already long
     *                enough, when the pad string is empty, or (with a warning)
     *                when the padding type is unknown
     */
    public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
    {
        $str_length = self::strlen($str);

        if ($final_str_length <= 0 OR $final_str_length <= $str_length) {
            return $str;
        }

        $pad_str_length = self::strlen($pad_str);

        // An empty pad string cannot fill anything and would divide by zero below.
        if ($pad_str_length < 1) {
            return $str;
        }

        $pad_length = $final_str_length - $str_length;

        if ($pad_type == STR_PAD_RIGHT) {
            $repeat = (int) ceil($pad_length / $pad_str_length);

            return self::substr($str . str_repeat($pad_str, $repeat), 0, $final_str_length);
        }

        if ($pad_type == STR_PAD_LEFT) {
            $repeat = (int) ceil($pad_length / $pad_str_length);

            return self::substr(str_repeat($pad_str, $repeat), 0, (int) $pad_length) . $str;
        }

        if ($pad_type == STR_PAD_BOTH) {
            // As with native str_pad(), the extra character of an odd pad
            // length goes to the right side.
            $pad_length_left = (int) floor($pad_length / 2);
            $pad_length_right = (int) ceil($pad_length / 2);
            $repeat_left = (int) ceil($pad_length_left / $pad_str_length);
            $repeat_right = (int) ceil($pad_length_right / $pad_str_length);

            $pad_left = self::substr(str_repeat($pad_str, $repeat_left), 0, $pad_length_left);
            $pad_right = self::substr(str_repeat($pad_str, $repeat_right), 0, $pad_length_right);

            return $pad_left . $str . $pad_right;
        }

        trigger_error('joosString::str_pad: Unknown padding type (' . $pad_type . ')', E_USER_WARNING);

        return $str;
    }

    /**
     * Splits a UTF-8 string into an array of chunks.
     * @see http://php.net/str_split
     *
     * @param string  $str          input string
     * @param integer $split_length maximum length of each chunk in characters
     *
     * @return array|boolean list of chunks, FALSE when $split_length is below 1
     */
    public static function str_split($str, $split_length = 1)
    {
        $split_length = (int) $split_length;

        if ($split_length < 1) {
            return FALSE;
        }

        if (self::strlen($str) <= $split_length) {
            return array($str);
        }

        // Full-size chunks first; the second alternative picks up the shorter tail.
        preg_match_all('/.{' . $split_length . '}|[^\x00]{1,' . $split_length . '}$/us', $str, $matches);

        return $matches[0];
    }

    /**
     * Reverses a UTF-8 string character by character.
     * @see http://php.net/strrev
     *
     * @param string $str string to be reversed
     *
     * @return string
     */
    public static function strrev($str)
    {
        preg_match_all('/./us', $str, $matches);

        return implode('', array_reverse($matches[0]));
    }

    /**
     * Strips whitespace (or other UTF-8 characters) from the beginning and
     * end of a string.
     * @see http://php.net/trim
     *
     * @param string      $str      input string
     * @param string|null $charlist characters to remove, NULL for native trim()
     *
     * @return string
     */
    public static function trim($str, $charlist = NULL)
    {
        if ($charlist === NULL) {
            return trim($str);
        }

        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }

    /**
     * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
     * @see http://php.net/ltrim
     *
     * @param string      $str      input string
     * @param string|null $charlist characters to remove, NULL for native ltrim()
     *
     * @return string
     */
    public static function ltrim($str, $charlist = NULL)
    {
        if ($charlist === NULL) {
            return ltrim($str);
        }

        // Escape the characters that are special inside a regex character class.
        $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);

        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
    }

    /**
     * Strips whitespace (or other UTF-8 characters) from the end of a string.
     * @see http://php.net/rtrim
     *
     * @param string      $str      input string
     * @param string|null $charlist characters to remove, NULL for native rtrim()
     *
     * @return string
     */
    public static function rtrim($str, $charlist = NULL)
    {
        if ($charlist === NULL) {
            return rtrim($str);
        }

        // Escape the characters that are special inside a regex character class.
        $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);

        return preg_replace('/[' . $charlist . ']++$/uD', '', $str);
    }

    /**
     * Returns the Unicode ordinal of the first character of a string.
     * @see http://php.net/ord
     *
     * The lead byte decides how many bytes are decoded. Continuation bytes are
     * not validated. A lead byte in the range 128-191 matches none of the
     * branches; when at least six bytes are supplied in that case the method
     * falls off the end and yields NULL.
     *
     * @param string $chr UTF-8 encoded character
     *
     * @return integer|boolean code point, FALSE (with a warning) when the
     *                         sequence is too short or the lead byte is 254/255
     */
    public static function ord($chr)
    {
        $ord0 = ord($chr);

        if ($ord0 >= 0 AND $ord0 <= 127) {
            return $ord0;
        }

        if (!isset($chr[1])) {
            trigger_error('Short sequence - at least 2 bytes expected, only 1 seen', E_USER_WARNING);

            return FALSE;
        }

        $ord1 = ord($chr[1]);

        if ($ord0 >= 192 AND $ord0 <= 223) {
            return ($ord0 - 192) * 64 + ($ord1 - 128);
        }

        if (!isset($chr[2])) {
            trigger_error('Short sequence - at least 3 bytes expected, only 2 seen', E_USER_WARNING);

            return FALSE;
        }

        $ord2 = ord($chr[2]);

        if ($ord0 >= 224 AND $ord0 <= 239) {
            return ($ord0 - 224) * 4096 + ($ord1 - 128) * 64 + ($ord2 - 128);
        }

        if (!isset($chr[3])) {
            trigger_error('Short sequence - at least 4 bytes expected, only 3 seen', E_USER_WARNING);

            return FALSE;
        }

        $ord3 = ord($chr[3]);

        if ($ord0 >= 240 AND $ord0 <= 247) {
            return ($ord0 - 240) * 262144 + ($ord1 - 128) * 4096 + ($ord2 - 128) * 64 + ($ord3 - 128);
        }

        if (!isset($chr[4])) {
            trigger_error('Short sequence - at least 5 bytes expected, only 4 seen', E_USER_WARNING);

            return FALSE;
        }

        $ord4 = ord($chr[4]);

        if ($ord0 >= 248 AND $ord0 <= 251) {
            return ($ord0 - 248) * 16777216 + ($ord1 - 128) * 262144 + ($ord2 - 128) * 4096 + ($ord3 - 128) * 64 + ($ord4 - 128);
        }

        if (!isset($chr[5])) {
            trigger_error('Short sequence - at least 6 bytes expected, only 5 seen', E_USER_WARNING);

            return FALSE;
        }

        if ($ord0 >= 252 AND $ord0 <= 253) {
            return ($ord0 - 252) * 1073741824 + ($ord1 - 128) * 16777216 + ($ord2 - 128) * 262144 + ($ord3 - 128) * 4096 + ($ord4 - 128) * 64 + (ord($chr[5]) - 128);
        }

        if ($ord0 >= 254 AND $ord0 <= 255) {
            trigger_error('Invalid UTF-8 with surrogate ordinal ' . $ord0, E_USER_WARNING);

            return FALSE;
        }
    }

    /**
     * Takes a UTF-8 string and returns an array of ints representing the Unicode characters.
     * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
     * Occurrences of the BOM are ignored. Surrogates are not allowed.
     *
     * The Original Code is Mozilla Communicator client code.
     * The Initial Developer of the Original Code is Netscape Communications Corporation.
     * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
     * Ported to PHP by Henri Sivonen <hsivonen@iki .fi>, see http://hsivonen.iki.fi/php-utf8/.
     * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
     *
     * @param string $str UTF-8 encoded string
     *
     * @return array|boolean Unicode code points, FALSE (with a warning) if the string is invalid
     */
    public static function to_unicode($str)
    {
        $mState = 0; // cached expected number of octets after the current octet until the beginning of the next UTF8 character sequence
        $mUcs4 = 0; // cached Unicode character
        $mBytes = 1; // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);

            if ($mState == 0) {
                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // First octet of 5 octet sequence.
                    //
                    // This is illegal because the encoded codepoint must be either
                    // (a) not the shortest form or
                    // (b) outside the Unicode range of 0-0x10FFFF.
                    // Rather than trying to resynchronize, we will carry on until the end
                    // of the sequence and let the later error handling code catch it.
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = $in;
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
                    trigger_error('utf8::to_unicode: Illegal sequence identifier in UTF-8 at byte ' . $i, E_USER_WARNING);

                    return FALSE;
                }
            } else {
                // When mState is non-zero, we expect a continuation of the multi-octet sequence
                if (0x80 == (0xC0 & $in)) {
                    // Legal continuation
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    // End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output
                    if (0 == --$mState) {
                        // Check for illegal sequences and codepoints
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) AND ($mUcs4 < 0x0080)) OR
                            ((3 == $mBytes) AND ($mUcs4 < 0x0800)) OR
                            ((4 == $mBytes) AND ($mUcs4 < 0x10000)) OR
                            (4 < $mBytes) OR
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) OR
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            trigger_error('utf8::to_unicode: Illegal sequence or codepoint in UTF-8 at byte ' . $i, E_USER_WARNING);

                            return FALSE;
                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        // Initialize UTF-8 cache
                        $mState = 0;
                        $mUcs4 = 0;
                        $mBytes = 1;
                    }
                } else {
                    // ((0xC0 & (*in) != 0x80) AND (mState != 0))
                    // Incomplete multi-octet sequence
                    trigger_error('joosString::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ' . $i, E_USER_WARNING);

                    return FALSE;
                }
            }
        }

        return $out;
    }

    /**
     * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
     * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
     * Occurrences of the BOM are ignored. Surrogates are not allowed.
     *
     * The Original Code is Mozilla Communicator client code.
     * The Initial Developer of the Original Code is Netscape Communications Corporation.
     * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
     * Ported to PHP by Henri Sivonen <hsivonen@iki .fi>, see http://hsivonen.iki.fi/php-utf8/.
     * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
     *
     * @param array $arr Unicode code points representing a string
     *
     * @return string|boolean UTF-8 string, FALSE (with a warning) when a code
     *                        point is a surrogate or outside 0..0x10FFFF
     */
    public static function from_unicode($arr)
    {
        // The result is accumulated in a plain string rather than an output
        // buffer, so the error paths below cannot leave a buffer open or leak
        // partial bytes into the real output.
        $out = '';

        foreach (array_keys($arr) as $k) {
            $cp = $arr[$k];

            if ($cp < 0) {
                // Negative values are not code points
                trigger_error('utf8::from_unicode: Codepoint out of Unicode range at index: ' . $k . ', value: ' . $cp, E_USER_WARNING);

                return FALSE;
            } elseif ($cp <= 0x007f) {
                // ASCII range (including control chars)
                $out .= chr($cp);
            } elseif ($cp <= 0x07ff) {
                // 2 byte sequence
                $out .= chr(0xc0 | ($cp >> 6))
                    . chr(0x80 | ($cp & 0x003f));
            } elseif ($cp == 0xFEFF) {
                // Byte order mark: skipped on purpose
                continue;
            } elseif ($cp >= 0xD800 AND $cp <= 0xDFFF) {
                // Surrogates are illegal
                trigger_error('utf8::from_unicode: Illegal surrogate at index: ' . $k . ', value: ' . $cp, E_USER_WARNING);

                return FALSE;
            } elseif ($cp <= 0xffff) {
                // 3 byte sequence
                $out .= chr(0xe0 | ($cp >> 12))
                    . chr(0x80 | (($cp >> 6) & 0x003f))
                    . chr(0x80 | ($cp & 0x003f));
            } elseif ($cp <= 0x10ffff) {
                // 4 byte sequence
                $out .= chr(0xf0 | ($cp >> 18))
                    . chr(0x80 | (($cp >> 12) & 0x3f))
                    . chr(0x80 | (($cp >> 6) & 0x3f))
                    . chr(0x80 | ($cp & 0x3f));
            } else {
                // Out of range
                trigger_error('utf8::from_unicode: Codepoint out of Unicode range at index: ' . $k . ', value: ' . $cp, E_USER_WARNING);

                return FALSE;
            }
        }

        return $out;
    }

    /**
     * Converts a value to UTF-8.
     *
     * Strings that already pass is_utf8() are returned untouched; other strings
     * are converted from cp1251 with iconv. Arrays and objects are walked
     * recursively (keys included) and come back as a new array. Any other
     * value is returned as is. The argument itself is never modified.
     *
     * @param mixed $text value to convert (taken by reference)
     *
     * @return mixed converted value
     *
     * @throws joosException when a string needs converting and iconv is unavailable
     */
    public static function to_utf8(&$text)
    {
        if (is_array($text) OR is_object($text)) {
            $converted = array();
            // Iterating by value is enough: nothing is written back to $text.
            foreach ($text as $k => $v) {
                $converted[self::to_utf8($k)] = self::to_utf8($v);
            }

            return $converted;
        }

        if (!is_string($text)) {
            return $text;
        }

        // Already UTF-8: return it right away
        if (self::is_utf8($text)) {
            return $text;
        }

        // Otherwise try converting through iconv
        if (function_exists('iconv')) {
            return iconv('cp1251', 'utf-8//IGNORE//TRANSLIT', $text);
        }

        throw new joosException('Перекодировка не поддерживается');
    }

    /**
     * Checks whether a value is valid UTF-8.
     *
     * Arrays are valid when every element is. Strings are checked with iconv
     * when available, otherwise with utf8_check(). Other scalars and NULL
     * count as valid; everything else (e.g. objects) does not.
     *
     * @param mixed   $data      value to check (taken by reference, not modified)
     * @param boolean $is_strict when TRUE, strings containing ASCII control
     *                           characters other than tab, LF and CR are rejected
     *
     * @return boolean
     */
    public static function is_utf8(&$data, $is_strict = true)
    {
        if (is_array($data)) {
            foreach ($data as $v) {
                if (!self::is_utf8($v, $is_strict)) {
                    return false;
                }
            }

            return true;
        }

        if (is_string($data)) {
            if (!function_exists('iconv')) {
                return self::utf8_check($data, $is_strict);
            }

            // With //IGNORE iconv either drops invalid bytes (shorter result)
            // or fails outright with FALSE; both mean the input is not clean UTF-8.
            $converted = iconv('UTF-8', 'UTF-8//IGNORE', $data);
            if ($converted === false OR strlen($converted) < strlen($data)) {
                return false;
            }

            if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xFF]/sS', $data)) {
                return false;
            }

            return true;
        }

        // Numbers, booleans and NULL are fine; anything else is not
        return is_scalar($data) || $data === null;
    }

    /**
     * Pure-PHP structural UTF-8 check.
     *
     * Verifies that every lead byte is followed by the right number of
     * continuation bytes. Sequences of up to 6 bytes are accepted; overlong
     * forms, surrogates and the code point range are not checked.
     *
     * @param string  $str       string to check
     * @param boolean $is_strict when TRUE, ASCII control characters other than
     *                           tab, LF and CR, as well as 0x7F, are rejected
     *
     * @return boolean
     */
    public static function utf8_check($str, $is_strict = true)
    {
        for ($i = 0, $len = strlen($str); $i < $len; $i++) {
            $c = ord($str[$i]);
            if ($c < 0x80) {
                if ($is_strict === false || ($c > 0x1F && $c < 0x7F) || $c == 0x09 || $c == 0x0A || $c == 0x0D) {
                    continue;
                }
                // A rejected ASCII byte matches no lead-byte pattern below and
                // therefore ends in "return false".
            }

            // Number of continuation bytes announced by the lead byte
            if (($c & 0xE0) == 0xC0) {
                $n = 1;
            } elseif (($c & 0xF0) == 0xE0) {
                $n = 2;
            } elseif (($c & 0xF8) == 0xF0) {
                $n = 3;
            } elseif (($c & 0xFC) == 0xF8) {
                $n = 4;
            } elseif (($c & 0xFE) == 0xFC) {
                $n = 5;
            } else {
                return false;
            }

            for ($j = 0; $j < $n; $j++) {
                $i++;
                if ($i == $len || ((ord($str[$i]) & 0xC0) != 0x80)) {
                    return false;
                }
            }
        }

        return true;
    }

}
909: