File core/libraries/string.php | Joostina CMS / CMF v2.* API

  1: <?php defined('_JOOS_CORE') or exit();
  2: 
  3: /**
  4:  * Библиотека работы со строками в правильной кодировке UTF-8
  5:  *
  6:  * @version    1.0
  7:  * @package    Core\Libraries
  8:  * @subpackage String
  9:  * @category   Libraries
 10:  * @author     Joostina Team <info@joostina.ru>
 11:  * @copyright  (C) 2007-2012 Joostina Team
 12:  * @license    MIT License http://www.opensource.org/licenses/mit-license.php
 13:  * Информация об авторах и лицензиях стороннего кода в составе Joostina CMS: docs/copyrights
 14:  *
 15:  * @author     Kohana Team
 16:  * @copyright  (c) 2007 Kohana Team
 17:  * @copyright  (c) 2005 Harry Fuecks
 18:  * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
 19:  *
 20:  * */
 21: class joosString
 22: {
 23:     // Called methods
 24:     public static $called = array();
 25: 
 26:     /**
 27:      * Tests whether a string contains only 7bit ASCII bytes. This is used to
 28:      * determine when to use native functions or UTF-8 functions.
 29:      *
 30:      * @param   string  string to check
 31:      *
 32:      * @return bool
 33:      */
 34:     public static function is_ascii($str)
 35:     {
 36:         return !preg_match('/[^\x00-\x7F]/S', $str);
 37:     }
 38: 
 39:     /**
 40:      * Strips out device control codes in the ASCII range.
 41:      *
 42:      * @param   string  string to clean
 43:      *
 44:      * @return string
 45:      */
 46:     public static function strip_ascii_ctrl($str)
 47:     {
 48:         return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
 49:     }
 50: 
 51:     /**
 52:      * Strips out all non-7bit ASCII bytes.
 53:      *
 54:      * @param   string  string to clean
 55:      *
 56:      * @return string
 57:      */
 58:     public static function strip_non_ascii($str)
 59:     {
 60:         return preg_replace('/[^\x00-\x7F]+/S', '', $str);
 61:     }
 62: 
 63:     /**
 64:      * Returns the length of the given string.
 65:      * @see http://php.net/strlen
 66:      *
 67:      * @param   string   string being measured for length
 68:      *
 69:      * @return integer
 70:      */
 71:     public static function strlen($str)
 72:     {
 73:         return mb_strlen($str, 'utf-8');
 74:     }
 75: 
 76:     /**
 77:      * Finds position of first occurrence of a UTF-8 string.
 78:      * @see     http://php.net/strlen
 79:      *
 80:      * @author  Harry Fuecks <hfuecks@gmail.com>
 81:      *
 82:      * @param   string   haystack
 83:      * @param   string   needle
 84:      * @param   integer  offset from which character in haystack to start searching
 85:      *
 86:      * @return integer position of needle
 87:      * @return boolean FALSE if the needle is not found
 88:      */
 89:     public static function strpos($str, $search, $offset = 0)
 90:     {
 91:         return mb_strpos($str, $search, $offset, 'UTF-8');
 92:     }
 93: 
 94:     /**
 95:      * Finds position of last occurrence of a char in a UTF-8 string.
 96:      * @see     http://php.net/strrpos
 97:      *
 98:      * @author  Harry Fuecks <hfuecks@gmail.com>
 99:      *
100:      * @param   string   haystack
101:      * @param   string   needle
102:      * @param   integer  offset from which character in haystack to start searching
103:      *
104:      * @return integer position of needle
105:      * @return boolean FALSE if the needle is not found
106:      */
107:     public static function strrpos($str, $search, $offset = 0)
108:     {
109:         return mb_strrpos($str, $search, $offset, 'utf-8');
110:     }
111: 
112:     /**
113:      * Returns part of a UTF-8 string.
114:      * @see     http://php.net/substr
115:      *
116:      * @author  Chris Smith <chris@jalakai.co.uk>
117:      *
118:      * @param   string   input string
119:      * @param   integer  offset
120:      * @param   integer  length limit
121:      *
122:      * @return string
123:      */
124:     public static function substr($str, $offset, $length = NULL)
125:     {
126:         return ($length === NULL) ? mb_substr($str, $offset, null, 'UTF-8') : mb_substr($str, $offset, $length, 'UTF-8');
127:     }
128: 
129:     /**
130:      * Replaces text within a portion of a UTF-8 string.
131:      * @see     http://php.net/substr_replace
132:      *
133:      * @author  Harry Fuecks <hfuecks@gmail.com>
134:      *
135:      * @param   string   input string
136:      * @param   string   replacement string
137:      * @param   integer  offset
138:      *
139:      * @return string
140:      */
141:     public static function substr_replace($str, $replacement, $offset, $length = NULL)
142:     {
143:         return ($length === NULL) ? substr_replace($str, $replacement, $offset) : substr_replace($str, $replacement, $offset, $length);
144:     }
145: 
146:     /**
147:      * Makes a UTF-8 string lowercase.
148:      * @see     http://php.net/strtolower
149:      *
150:      * @author  Andreas Gohr <andi@splitbrain.org>
151:      *
152:      * @param   string   mixed case string
153:      *
154:      * @return string
155:      */
156:     public static function strtolower($str)
157:     {
158:         return mb_strtolower($str, 'UTF-8');
159:     }
160: 
161:     /**
162:      * Makes a UTF-8 string uppercase.
163:      * @see     http://php.net/strtoupper
164:      *
165:      * @author  Andreas Gohr <andi@splitbrain.org>
166:      *
167:      * @param   string   mixed case string
168:      *
169:      * @return string
170:      */
171:     public static function strtoupper($str)
172:     {
173:         return mb_strtoupper($str, 'UTF-8');
174:     }
175: 
176:     /**
177:      * Makes a UTF-8 string's first character uppercase.
178:      * @see     http://php.net/ucfirst
179:      *
180:      * @author  Harry Fuecks <hfuecks@gmail.com>
181:      *
182:      * @param   string   mixed case string
183:      *
184:      * @return string
185:      */
186:     public static function ucfirst($str)
187:     {
188:         return mb_strtoupper(mb_substr($str, 0, 1, 'UTF-8'), 'UTF-8') . mb_substr($str, 1, mb_strlen($str, 'UTF-8'), 'UTF-8');
189:     }
190: 
191:     /**
192:      * Makes the first character of every word in a UTF-8 string uppercase.
193:      * @see     http://php.net/ucwords
194:      *
195:      * @author  Harry Fuecks <hfuecks@gmail.com>
196:      *
197:      * @param   string   mixed case string
198:      *
199:      * @return string
200:      */
201:     public static function ucwords($str)
202:     {
203:         return mb_convert_case($str, MB_CASE_TITLE, 'UTF-8');
204:     }
205: 
206:     /**
207:      * Case-insensitive UTF-8 string comparison.
208:      * @see     http://php.net/strcasecmp
209:      *
210:      * @author  Harry Fuecks <hfuecks@gmail.com>
211:      *
212:      * @param   string   string to compare
213:      * @param   string   string to compare
214:      *
215:      * @return integer less than 0 if str1 is less than str2
216:      * @return integer greater than 0 if str1 is greater than str2
217:      * @return integer 0 if they are equal
218:      */
219:     public static function strcasecmp($str1, $str2)
220:     {
221:         $str1 = mb_strtolower($str1, 'UTF-8');
222:         $str2 = mb_strtolower($str2, 'UTF-8');
223: 
224:         return strcmp($str1, $str2);
225:     }
226: 
227:     /**
228:      * Returns a string or an array with all occurrences of search in subject (ignoring case).
229:      * replaced with the given replace value.
230:      * @see     http://php.net/str_ireplace
231:      *
232:      * @note    It's not fast and gets slower if $search and/or $replace are arrays.
233:      * @author  Harry Fuecks <hfuecks@gmail.com
234:      *
235:      * @param   string|array  text          to replace
236:      * @param   string|array  replacement   text
237:      * @param   string|array  subject       text
238:      * @param                 integer       number of matched and replaced needles will be returned via this parameter which is passed by reference
239:      *
240:      * @return string if the input was a string
241:      * @return array  if the input was an array
242:      */
243:     public static function str_ireplace($search, $replace, $str, & $count = NULL)
244:     {
245:         if (!is_array($search)) {
246: 
247:             $slen = strlen($search);
248:             $lendif = strlen($replace) - $slen;
249:             if ($slen == 0) {
250:                 return $str;
251:             }
252: 
253:             $search = self::strtolower($search);
254: 
255:             $search = preg_quote($search, '/');
256:             $lstr = self::strtolower($str);
257:             $i = 0;
258:             $matched = 0;
259:             while (preg_match('/(.*)' . $search . '/Us', $lstr, $matches)) {
260:                 if ($i === $count) {
261:                     break;
262:                 }
263:                 $mlen = strlen($matches[0]);
264:                 $lstr = substr($lstr, $mlen);
265:                 $str = substr_replace($str, $replace, $matched + strlen($matches[1]), $slen);
266:                 $matched += $mlen + $lendif;
267:                 $i++;
268:             }
269: 
270:             return $str;
271:         } else {
272: 
273:             foreach (array_keys($search) as $k) {
274: 
275:                 if (is_array($replace)) {
276: 
277:                     if (array_key_exists($k, $replace)) {
278: 
279:                         $str = self::str_ireplace($search[$k], $replace[$k], $str, $count);
280:                     } else {
281: 
282:                         $str = self::str_ireplace($search[$k], '', $str, $count);
283:                     }
284:                 } else {
285: 
286:                     $str = self::str_ireplace($search[$k], $replace, $str, $count);
287:                 }
288:             }
289: 
290:             return $str;
291:         }
292:     }
293: 
294:     /**
295:      * Case-insenstive UTF-8 version of strstr. Returns all of input string
296:      * from the first occurrence of needle to the end.
297:      * @see    http://php.net/stristr
298:      *
299:      * @author Harry Fuecks <hfuecks@gmail.com>
300:      *
301:      * @param   string   input string
302:      * @param   string   needle
303:      *
304:      * @return string  matched substring if found
305:      * @return boolean FALSE if the substring was not found
306:      */
307:     public static function stristr($str, $search)
308:     {
309:         if ($search == '') {
310:             return $str;
311:         }
312: 
313:         $str_lower = self::strtolower($str);
314:         $search_lower = self::strtolower($search);
315: 
316:         preg_match('/^(.*?)' . preg_quote($search_lower, '/') . '/s', $str_lower, $matches);
317: 
318:         return isset($matches[1]) ? substr($str, strlen($matches[1])) : FALSE;
319:     }
320: 
321:     /**
322:      * Finds the length of the initial segment matching mask.
323:      * @see    http://php.net/strspn
324:      *
325:      * @author Harry Fuecks <hfuecks@gmail.com>
326:      *
327:      * @param   string   input string
328:      * @param   string   mask for search
329:      * @param   integer  start position of the string to examine
330:      * @param   integer  length of the string to examine
331:      *
332:      * @return integer length of the initial segment that contains characters in the mask
333:      */
334:     public static function strspn($str, $mask, $offset = NULL, $length = NULL)
335:     {
336:         if ($str == '' OR $mask == '') {
337:             return 0;
338:         }
339: 
340:         if ($offset !== NULL OR $length !== NULL) {
341:             $str = self::substr($str, $offset, $length);
342:         }
343: 
344:         // Escape these characters:  - [ ] . : \ ^ /
345:         // The . and : are escaped to prevent possible warnings about POSIX regex elements
346:         $mask = preg_replace('#[-[\].:\\\\^/]#', '\\\\$0', $mask);
347:         preg_match('/^[^' . $mask . ']+/u', $str, $matches);
348: 
349:         return isset($matches[0]) ? self::strlen($matches[0]) : 0;
350:     }
351: 
352:     /**
353:      * Finds the length of the initial segment not matching mask.
354:      * @see     http://php.net/strcspn
355:      *
356:      * @author  Harry Fuecks <hfuecks@gmail.com>
357:      *
358:      * @param   string   input string
359:      * @param   string   mask for search
360:      * @param   integer  start position of the string to examine
361:      * @param   integer  length of the string to examine
362:      *
363:      * @return integer length of the initial segment that contains characters not in the mask
364:      */
365:     public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
366:     {
367:         if ($str == '' OR $mask == '') {
368:             return 0;
369:         }
370: 
371:         if ($str !== NULL OR $length !== NULL) {
372:             $str = self::substr($str, $offset, $length);
373:         }
374: 
375:         // Escape these characters:  - [ ] . : \ ^ /
376:         // The . and : are escaped to prevent possible warnings about POSIX regex elements
377:         $mask = preg_replace('#[-[\].:\\\\^/]#', '\\\\$0', $mask);
378:         preg_match('/^[^' . $mask . ']+/u', $str, $matches);
379: 
380:         return isset($matches[0]) ? self::strlen($matches[0]) : 0;
381:     }
382: 
383:     /**
384:      * Pads a UTF-8 string to a certain length with another string.
385:      * @see     http://php.net/str_pad
386:      *
387:      * @author  Harry Fuecks <hfuecks@gmail.com>
388:      *
389:      * @param   string   input string
390:      * @param   integer  desired string length after padding
391:      * @param   string   string to use as padding
392:      * @param   string   padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
393:      *
394:      * @return string
395:      */
396:     public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
397:     {
398:         $str_length = self::strlen($str);
399: 
400:         if ($final_str_length <= 0 OR $final_str_length <= $str_length) {
401:             return $str;
402:         }
403: 
404:         $pad_str_length = self::strlen($pad_str);
405:         $pad_length = $final_str_length - $str_length;
406: 
407:         if ($pad_type == STR_PAD_RIGHT) {
408:             $repeat = ceil($pad_length / $pad_str_length);
409: 
410:             return self::substr($str . str_repeat($pad_str, $repeat), 0, $final_str_length);
411:         }
412: 
413:         if ($pad_type == STR_PAD_LEFT) {
414:             $repeat = ceil($pad_length / $pad_str_length);
415: 
416:             return self::substr(str_repeat($pad_str, $repeat), 0, floor($pad_length)) . $str;
417:         }
418: 
419:         if ($pad_type == STR_PAD_BOTH) {
420:             $pad_length /= 2;
421:             $pad_length_left = floor($pad_length);
422:             $pad_length_right = ceil($pad_length);
423:             $repeat_left = ceil($pad_length_left / $pad_str_length);
424:             $repeat_right = ceil($pad_length_right / $pad_str_length);
425: 
426:             $pad_left = self::substr(str_repeat($pad_str, $repeat_left), 0, $pad_length_left);
427:             $pad_right = self::substr(str_repeat($pad_str, $repeat_right), 0, $pad_length_left);
428: 
429:             return $pad_left . $str . $pad_right;
430:         }
431:     }
432: 
433:     /**
434:      * Converts a UTF-8 string to an array.
435:      * @see     http://php.net/str_split
436:      *
437:      * @author  Harry Fuecks <hfuecks@gmail.com>
438:      *
439:      * @param   string   input string
440:      * @param   integer  maximum length of each chunk
441:      *
442:      * @return array
443:      */
444:     public static function str_split($str, $split_length = 1)
445:     {
446:         $split_length = (int) $split_length;
447: 
448:         if ($split_length < 1) {
449:             return FALSE;
450:         }
451: 
452:         if (self::strlen($str) <= $split_length) {
453:             return array($str);
454:         }
455: 
456:         preg_match_all('/.{' . $split_length . '}|[^\x00]{1,' . $split_length . '}$/us', $str, $matches);
457: 
458:         return $matches[0];
459:     }
460: 
461:     /**
462:      * Reverses a UTF-8 string.
463:      * @see     http://php.net/strrev
464:      *
465:      * @author  Harry Fuecks <hfuecks@gmail.com>
466:      *
467:      * @param   string   string to be reversed
468:      *
469:      * @return string
470:      */
471:     public static function strrev($str)
472:     {
473:         preg_match_all('/./us', $str, $matches);
474: 
475:         return implode('', array_reverse($matches[0]));
476:     }
477: 
478:     /**
479:      * Strips whitespace (or other UTF-8 characters) from the beginning and
480:      * end of a string.
481:      * @see     http://php.net/trim
482:      *
483:      * @author  Andreas Gohr <andi@splitbrain.org>
484:      *
485:      * @param   string   input string
486:      * @param   string   string of characters to remove
487:      *
488:      * @return string
489:      */
490:     public static function trim($str, $charlist = NULL)
491:     {
492:         if ($charlist === NULL) {
493:             return trim($str);
494:         }
495: 
496:         return self::ltrim(self::rtrim($str, $charlist), $charlist);
497:     }
498: 
499:     /**
500:      * Strips whitespace (or other UTF-8 characters) from the beginning of a string.
501:      * @see     http://php.net/ltrim
502:      *
503:      * @author  Andreas Gohr <andi@splitbrain.org>
504:      *
505:      * @param   string   input string
506:      * @param   string   string of characters to remove
507:      *
508:      * @return string
509:      */
510:     public static function ltrim($str, $charlist = NULL)
511:     {
512:         if ($charlist === NULL) {
513:             return ltrim($str);
514:         }
515: 
516:         $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);
517: 
518:         return preg_replace('/^[' . $charlist . ']+/u', '', $str);
519:     }
520: 
521:     /**
522:      * Strips whitespace (or other UTF-8 characters) from the end of a string.
523:      * @see     http://php.net/rtrim
524:      *
525:      * @author  Andreas Gohr <andi@splitbrain.org>
526:      *
527:      * @param   string   input string
528:      * @param   string   string of characters to remove
529:      *
530:      * @return string
531:      */
532:     public static function rtrim($str, $charlist = NULL)
533:     {
534:         if ($charlist === NULL) {
535:             return rtrim($str);
536:         }
537: 
538:         $charlist = preg_replace('#[-\[\]:\\\\^/]#', '\\\\$0', $charlist);
539: 
540:         return preg_replace('/[' . $charlist . ']++$/uD', '', $str);
541:     }
542: 
543:     /**
544:      * Returns the unicode ordinal for a character.
545:      * @see    http://php.net/ord
546:      *
547:      * @author Harry Fuecks <hfuecks@gmail.com>
548:      *
549:      * @param   string   UTF-8 encoded character
550:      *
551:      * @return integer
552:      */
553:     public static function ord($chr)
554:     {
555:         $ord0 = ord($chr);
556: 
557:         if ($ord0 >= 0 AND $ord0 <= 127) {
558:             return $ord0;
559:         }
560: 
561:         if (!isset($chr[1])) {
562:             trigger_error('Short sequence - at least 2 bytes expected, only 1 seen', E_USER_WARNING);
563: 
564:             return FALSE;
565:         }
566: 
567:         $ord1 = ord($chr[1]);
568: 
569:         if ($ord0 >= 192 AND $ord0 <= 223) {
570:             return ($ord0 - 192) * 64 + ($ord1 - 128);
571:         }
572: 
573:         if (!isset($chr[2])) {
574:             trigger_error('Short sequence - at least 3 bytes expected, only 2 seen', E_USER_WARNING);
575: 
576:             return FALSE;
577:         }
578: 
579:         $ord2 = ord($chr[2]);
580: 
581:         if ($ord0 >= 224 AND $ord0 <= 239) {
582:             return ($ord0 - 224) * 4096 + ($ord1 - 128) * 64 + ($ord2 - 128);
583:         }
584: 
585:         if (!isset($chr[3])) {
586:             trigger_error('Short sequence - at least 4 bytes expected, only 3 seen', E_USER_WARNING);
587: 
588:             return FALSE;
589:         }
590: 
591:         $ord3 = ord($chr[3]);
592: 
593:         if ($ord0 >= 240 AND $ord0 <= 247) {
594:             return ($ord0 - 240) * 262144 + ($ord1 - 128) * 4096 + ($ord2 - 128) * 64 + ($ord3 - 128);
595:         }
596: 
597:         if (!isset($chr[4])) {
598:             trigger_error('Short sequence - at least 5 bytes expected, only 4 seen', E_USER_WARNING);
599: 
600:             return FALSE;
601:         }
602: 
603:         $ord4 = ord($chr[4]);
604: 
605:         if ($ord0 >= 248 AND $ord0 <= 251) {
606:             return ($ord0 - 248) * 16777216 + ($ord1 - 128) * 262144 + ($ord2 - 128) * 4096 + ($ord3 - 128) * 64 + ($ord4 - 128);
607:         }
608: 
609:         if (!isset($chr[5])) {
610:             trigger_error('Short sequence - at least 6 bytes expected, only 5 seen', E_USER_WARNING);
611: 
612:             return FALSE;
613:         }
614: 
615:         if ($ord0 >= 252 AND $ord0 <= 253) {
616:             return ($ord0 - 252) * 1073741824 + ($ord1 - 128) * 16777216 + ($ord2 - 128) * 262144 + ($ord3 - 128) * 4096 + ($ord4 - 128) * 64 + (ord($chr[5]) - 128);
617:         }
618: 
619:         if ($ord0 >= 254 AND $ord0 <= 255) {
620:             trigger_error('Invalid UTF-8 with surrogate ordinal ' . $ord0, E_USER_WARNING);
621: 
622:             return FALSE;
623:         }
624:     }
625: 
626:     /**
627:      * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
628:      * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
629:      * Occurrances of the BOM are ignored. Surrogates are not allowed.
630:      *
631:      * The Original Code is Mozilla Communicator client code.
632:      * The Initial Developer of the Original Code is Netscape Communications Corporation.
633:      * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
634:      * Ported to PHP by Henri Sivonen <hsivonen@iki                                   .fi>, see http://hsivonen.iki.fi/php-utf8/.
635:      * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
636:      *
637:      * @param   string   UTF-8 encoded string
638:      *
639:      * @return array   unicode code points
640:      * @return boolean FALSE if the string is invalid
641:      */
642:     public static function to_unicode($str)
643:     {
644:         $mState = 0; // cached expected number of octets after the current octet until the beginning of the next UTF8 character sequence
645:         $mUcs4 = 0; // cached Unicode character
646:         $mBytes = 1; // cached expected number of octets in the current sequence
647: 
648:         $out = array();
649: 
650:         $len = strlen($str);
651: 
652:         for ($i = 0; $i < $len; $i++) {
653:             $in = ord($str[$i]);
654: 
655:             if ($mState == 0) {
656:                 // When mState is zero we expect either a US-ASCII character or a
657:                 // multi-octet sequence.
658:                 if (0 == (0x80 & $in)) {
659:                     // US-ASCII, pass straight through.
660:                     $out[] = $in;
661:                     $mBytes = 1;
662:                 } elseif (0xC0 == (0xE0 & $in)) {
663:                     // First octet of 2 octet sequence
664:                     $mUcs4 = $in;
665:                     $mUcs4 = ($mUcs4 & 0x1F) << 6;
666:                     $mState = 1;
667:                     $mBytes = 2;
668:                 } elseif (0xE0 == (0xF0 & $in)) {
669:                     // First octet of 3 octet sequence
670:                     $mUcs4 = $in;
671:                     $mUcs4 = ($mUcs4 & 0x0F) << 12;
672:                     $mState = 2;
673:                     $mBytes = 3;
674:                 } elseif (0xF0 == (0xF8 & $in)) {
675:                     // First octet of 4 octet sequence
676:                     $mUcs4 = $in;
677:                     $mUcs4 = ($mUcs4 & 0x07) << 18;
678:                     $mState = 3;
679:                     $mBytes = 4;
680:                 } elseif (0xF8 == (0xFC & $in)) {
681:                     // First octet of 5 octet sequence.
682:                     //
683:                     // This is illegal because the encoded codepoint must be either
684:                     // (a) not the shortest form or
685:                     // (b) outside the Unicode range of 0-0x10FFFF.
686:                     // Rather than trying to resynchronize, we will carry on until the end
687:                     // of the sequence and let the later error handling code catch it.
688:                     $mUcs4 = $in;
689:                     $mUcs4 = ($mUcs4 & 0x03) << 24;
690:                     $mState = 4;
691:                     $mBytes = 5;
692:                 } elseif (0xFC == (0xFE & $in)) {
693:                     // First octet of 6 octet sequence, see comments for 5 octet sequence.
694:                     $mUcs4 = $in;
695:                     $mUcs4 = ($mUcs4 & 1) << 30;
696:                     $mState = 5;
697:                     $mBytes = 6;
698:                 } else {
699:                     // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
700:                     trigger_error('utf8::to_unicode: Illegal sequence identifier in UTF-8 at byte ' . $i, E_USER_WARNING);
701: 
702:                     return FALSE;
703:                 }
704:             } else {
705:                 // When mState is non-zero, we expect a continuation of the multi-octet sequence
706:                 if (0x80 == (0xC0 & $in)) {
707:                     // Legal continuation
708:                     $shift = ($mState - 1) * 6;
709:                     $tmp = $in;
710:                     $tmp = ($tmp & 0x0000003F) << $shift;
711:                     $mUcs4 |= $tmp;
712: 
713:                     // End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output
714:                     if (0 == --$mState) {
715:                         // Check for illegal sequences and codepoints
716:                         // From Unicode 3.1, non-shortest form is illegal
717:                         if (((2 == $mBytes) AND ($mUcs4 < 0x0080)) OR
718:                             ((3 == $mBytes) AND ($mUcs4 < 0x0800)) OR
719:                             ((4 == $mBytes) AND ($mUcs4 < 0x10000)) OR
720:                             (4 < $mBytes) OR
721:                             // From Unicode 3.2, surrogate characters are illegal
722:                             (($mUcs4 & 0xFFFFF800) == 0xD800) OR
723:                             // Codepoints outside the Unicode range are illegal
724:                             ($mUcs4 > 0x10FFFF)
725:                         ) {
726:                             trigger_error('utf8::to_unicode: Illegal sequence or codepoint in UTF-8 at byte ' . $i, E_USER_WARNING);
727: 
728:                             return FALSE;
729:                         }
730: 
731:                         if (0xFEFF != $mUcs4) {
732:                             // BOM is legal but we don't want to output it
733:                             $out[] = $mUcs4;
734:                         }
735: 
736:                         // Initialize UTF-8 cache
737:                         $mState = 0;
738:                         $mUcs4 = 0;
739:                         $mBytes = 1;
740:                     }
741:                 } else {
742:                     // ((0xC0 & (*in) != 0x80) AND (mState != 0))
743:                     // Incomplete multi-octet sequence
744:                     trigger_error('joosString::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ' . $i, E_USER_WARNING);
745: 
746:                     return FALSE;
747:                 }
748:             }
749:         }
750: 
751:         return $out;
752:     }
753: 
754:     /**
755:      * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
756:      * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
757:      * Occurrances of the BOM are ignored. Surrogates are not allowed.
758:      *
759:      * The Original Code is Mozilla Communicator client code.
760:      * The Initial Developer of the Original Code is Netscape Communications Corporation.
761:      * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
762:      * Ported to PHP by Henri Sivonen <hsivonen@iki                                   .fi>, see http://hsivonen.iki.fi/php-utf8/.
763:      * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
764:      *
765:      * @param   array    unicode code points representing a string
766:      *
767:      * @return string  utf8 string of characters
768:      * @return boolean FALSE if a code point cannot be found
769:      */
770:     public static function from_unicode($arr)
771:     {
772:         ob_start();
773: 
774:         $keys = array_keys($arr);
775: 
776:         foreach ($keys as $k) {
777:             // ASCII range (including control chars)
778:             if (($arr[$k] >= 0) AND ($arr[$k] <= 0x007f)) {
779:                 echo chr($arr[$k]);
780:             } // 2 byte sequence
781:             elseif ($arr[$k] <= 0x07ff) {
782:                 echo chr(0xc0 | ($arr[$k] >> 6));
783:                 echo chr(0x80 | ($arr[$k] & 0x003f));
784:             } // Byte order mark (skip)
785:             elseif ($arr[$k] == 0xFEFF) {
786:                 // nop -- zap the BOM
787:             } // Test for illegal surrogates
788:             elseif ($arr[$k] >= 0xD800 AND $arr[$k] <= 0xDFFF) {
789:                 // Found a surrogate
790:                 trigger_error('utf8::from_unicode: Illegal surrogate at index: ' . $k . ', value: ' . $arr[$k], E_USER_WARNING);
791: 
792:                 return FALSE;
793:             } // 3 byte sequence
794:             elseif ($arr[$k] <= 0xffff) {
795:                 echo chr(0xe0 | ($arr[$k] >> 12));
796:                 echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
797:                 echo chr(0x80 | ($arr[$k] & 0x003f));
798:             } // 4 byte sequence
799:             elseif ($arr[$k] <= 0x10ffff) {
800:                 echo chr(0xf0 | ($arr[$k] >> 18));
801:                 echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
802:                 echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
803:                 echo chr(0x80 | ($arr[$k] & 0x3f));
804:             } // Out of range
805:             else {
806:                 trigger_error('utf8::from_unicode: Codepoint out of Unicode range at index: ' . $k . ', value: ' . $arr[$k], E_USER_WARNING);
807: 
808:                 return FALSE;
809:             }
810:         }
811: 
812:         return ob_get_clean();
813:     }
814: 
815:     public static function to_utf8(&$text)
816:     {
817:         if (is_array($text) OR is_object($text)) {
818:             $d = array();
819:             foreach ($text as $k => &$v) {
820:                 $d[self::to_utf8($k)] = self::to_utf8($v);
821:             }
822: 
823:             return $d;
824:         }
825:         if (is_string($text)) {
826:             if (self::is_utf8($text)) { // если это юникод - сразу его возвращаем
827: 
828:                 return $text;
829:             }
830:             if (function_exists('iconv')) { // пробуем конвертировать через iconv
831: 
832:                 return iconv('cp1251', 'utf-8//IGNORE//TRANSLIT', $text);
833:             }
834: 
835:             throw new joosException('Перекодировка не поддерживается');
836:         }
837: 
838:         return $text;
839:     }
840: 
841:     /* проверка на юникод */
842:     public static function is_utf8(&$data, $is_strict = true)
843:     {
844:         if (is_array($data)) { // массив
845:             foreach ($data as &$v) {
846:                 if (!self::is_utf8($v, $is_strict)) {
847:                     return false;
848:                 }
849:             }
850: 
851:             return true;
852:         } elseif (is_string($data)) { // строка
853:             if (function_exists('iconv')) {
854:                 $distance = strlen($data) - strlen(iconv('UTF-8', 'UTF-8//IGNORE', $data));
855:                 if ($distance > 0) {
856:                     return false;
857:                 }
858:                 if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xFF]/sS', $data)) {
859:                     return false;
860:                 }
861: 
862:                 return true;
863:             }
864: 
865:             return self::utf8_check($data, $is_strict);
866:         } elseif (is_scalar($data) || $data===null) { //числа, булево и ничего
867: 
868:             return true;
869:         }
870: 
871:         return false;
872:     }
873: 
874:     /* проверка на юникод */
875:     public static function utf8_check($str, $is_strict = true)
876:     {
877:         for ($i = 0, $len = strlen($str); $i < $len; $i++) {
878:             $c = ord($str[$i]);
879:             if ($c < 0x80) {
880:                 if ($is_strict === false || ($c > 0x1F && $c < 0x7F) || $c == 0x09 || $c == 0x0A || $c == 0x0D) {
881:                     continue;
882:                 }
883:             }
884:             if (($c & 0xE0) == 0xC0) {
885:                 $n = 1;
886:             } elseif (($c & 0xF0) == 0xE0) {
887:                 $n = 2;
888:             } elseif (($c & 0xF8) == 0xF0) {
889:                 $n = 3;
890:             } elseif (($c & 0xFC) == 0xF8) {
891:                 $n = 4;
892:             } elseif (($c & 0xFE) == 0xFC) {
893:                 $n = 5;
894:             } else {
895:                 return false;
896:             }
897:             for ($j = 0; $j < $n; $j++) {
898:                 $i++;
899:                 if ($i == $len || ((ord($str[$i]) & 0xC0) != 0x80)) {
900:                     return false;
901:                 }
902:             }
903:         }
904: 
905:         return true;
906:     }
907: 
908: }
909:
Packages

Classes