判断一个字符串是否为 UTF-8 编码是很常见的需求。但说实话,想要完全准确地检测出来并不现实,只能得到一个大概的结论。下面是从网上找到的一些方法以及对应的参考链接:
方法一:
<?php
// Maximum number of bytes handed to a single preg_match() call. Long inputs
// are validated piecewise so the repeated group never runs over a huge subject.
if (!defined('_is_utf8_split')) {
    define('_is_utf8_split', 5000);
}

/**
 * Check whether a byte string consists solely of well-formed UTF-8.
 *
 * The accepted grammar is the W3C one
 * (http://w3.org/International/questions/qa-forms-utf-8.html): tab, LF, CR and
 * printable ASCII (0x20-0x7E), plus non-overlong 2-, 3- and 4-byte sequences
 * excluding surrogates. Other control bytes and 0x7F are rejected.
 *
 * Chunking idea based on: http://mobile-website.mobi/php-utf8-vs-iso-8859-1-59
 *
 * @param string $string Raw bytes to validate.
 * @return bool True when every byte belongs to an accepted sequence
 *              (the empty string is accepted), false otherwise.
 */
function is_utf8($string) // v1.02
{
    $pattern = '%^(?:
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )*$%xs';

    $length = strlen($string);
    // A chunk must be longer than the 3 bytes we may step back, otherwise
    // the loop below could stop making progress.
    $chunkSize = max(4, (int) _is_utf8_split);

    if ($length <= $chunkSize) {
        return preg_match($pattern, $string) === 1;
    }

    // Every chunk has to be valid, starting from offset 0.
    for ($offset = 0; $offset < $length; $offset = $end) {
        $end = min($offset + $chunkSize, $length);

        // Do not cut a multi-byte character in half: while the next chunk
        // would begin with a continuation byte (10xxxxxx), move the boundary
        // back. A valid sequence has at most 3 continuation bytes; if the
        // boundary is still wrong after 3 steps the data is malformed and the
        // regex rejects it.
        for ($back = 0; $back < 3 && $end < $length && (ord($string[$end]) & 0xC0) === 0x80; $back++) {
            $end--;
        }

        if (preg_match($pattern, substr($string, $offset, $end - $offset)) !== 1) {
            return false;
        }
    }

    return true;
}

var_dump(is_utf8('haha'));
方法二:
/**
 * Report whether a byte string contains at least one well-formed multi-byte
 * UTF-8 sequence.
 *
 * The pattern is unanchored and has no ASCII alternative, so pure-ASCII input
 * yields 0 and a string mixing valid sequences with invalid bytes still
 * yields 1. It detects the presence of UTF-8; it does not validate the whole
 * string.
 *
 * @param string $string Raw bytes to inspect.
 * @return int|false 1 if a multi-byte sequence is found, 0 if not,
 *                   false if preg_match() itself fails.
 */
function detectUTF8($string)
{
    return preg_match('%(?:
          [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )+%xs', $string);
}
还从网上找了个判断GB2312的PHP函数,不过还没测试正确性,先记录一下吧:
/**
 * Heuristically guess whether a byte string is GB2312 rather than UTF-8.
 *
 * The function never validates GB2312 itself. It looks for the first byte in
 * the range 228-233 (0xE4-0xE9) and checks whether the two following bytes
 * are both in 128-191 (0x80-0xBF), i.e. whether the three bytes look like a
 * 3-byte UTF-8 sequence. If they do, the string is taken to be UTF-8 and the
 * result is false; in every other case the result is true.
 *
 * NOTE(review): untested heuristic recorded from the web; strings without any
 * byte in 228-233 (including pure ASCII) are always reported as GB2312.
 *
 * @param string $string Raw bytes to inspect.
 * @return bool False when the first candidate byte starts a UTF-8-looking
 *              3-byte sequence, true otherwise.
 */
function isGb2312($string)
{
    $length = strlen($string);

    for ($i = 0; $i < $length; $i++) {
        $v = ord($string[$i]);

        if (($v >= 228) && ($v <= 233)) {
            // Fewer than two bytes follow: it cannot be a complete 3-byte
            // UTF-8 sequence, so fall back to "GB2312".
            if (($i + 2) >= $length) {
                return true;
            }

            $v1 = ord($string[$i + 1]);
            $v2 = ord($string[$i + 2]);

            // Both trailing bytes are UTF-8 continuation bytes => UTF-8.
            if (($v1 >= 128) && ($v1 <= 191) && ($v2 >= 128) && ($v2 <= 191)) {
                return false;
            }

            return true;
        }
    }

    return true;
}