Source
File: wp-includes/wp-db.php
protected function strip_invalid_text( $data ) {
$db_check_string = false;
foreach ( $data as &$value ) {
$charset = $value['charset'];
if ( is_array( $value['length'] ) ) {
$length = $value['length']['length'];
$truncate_by_byte_length = 'byte' === $value['length']['type'];
} else {
$length = false;
// Since we have no length, we'll never truncate. Initialize the variable to false.
// True would take us through an unnecessary (for this case) codepath below.
$truncate_by_byte_length = false;
}
// There's no charset to work with.
if ( false === $charset ) {
continue;
}
// Column isn't a string.
if ( ! is_string( $value['value'] ) ) {
continue;
}
$needs_validation = true;
if (
// latin1 can store any byte sequence.
'latin1' === $charset
||
// ASCII is always OK.
( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) )
) {
$truncate_by_byte_length = true;
$needs_validation = false;
}
if ( $truncate_by_byte_length ) {
mbstring_binary_safe_encoding();
if ( false !== $length && strlen( $value['value'] ) > $length ) {
$value['value'] = substr( $value['value'], 0, $length );
}
reset_mbstring_encoding();
if ( ! $needs_validation ) {
continue;
}
}
// utf8 can be handled by regex, which is a bunch faster than a DB lookup.
if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) {
$regex = '/
(
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xE1-\xEC][\x80-\xBF]{2}
| \xED[\x80-\x9F][\x80-\xBF]
| [\xEE-\xEF][\x80-\xBF]{2}';
if ( 'utf8mb4' === $charset ) {
$regex .= '
| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3
| [\xF1-\xF3][\x80-\xBF]{3}
| \xF4[\x80-\x8F][\x80-\xBF]{2}
';
}
$regex .= '){1,40} # ...one or more times
)
| . # anything else
/x';
$value['value'] = preg_replace( $regex, '$1', $value['value'] );
if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) {
$value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' );
}
continue;
}
// We couldn't use any local conversions, send it to the DB.
$value['db'] = true;
$db_check_string = true;
}
unset( $value ); // Remove by reference.
if ( $db_check_string ) {
$queries = array();
foreach ( $data as $col => $value ) {
if ( ! empty( $value['db'] ) ) {
// We're going to need to truncate by characters or bytes, depending on the length value we have.
if ( isset( $value['length']['type'] ) && 'byte' === $value['length']['type'] ) {
// Using binary causes LEFT() to truncate by bytes.
$charset = 'binary';
} else {
$charset = $value['charset'];
}
if ( $this->charset ) {
$connection_charset = $this->charset;
} else {
if ( $this->use_mysqli ) {
$connection_charset = mysqli_character_set_name( $this->dbh );
} else {
$connection_charset = mysql_client_encoding();
}
}
if ( is_array( $value['length'] ) ) {
$length = sprintf( '%.0f', $value['length']['length'] );
$queries[ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING $charset ), $length ) USING $connection_charset )", $value['value'] );
} elseif ( 'binary' !== $charset ) {
// If we don't have a length, there's no need to convert binary - it will always return the same result.
$queries[ $col ] = $this->prepare( "CONVERT( CONVERT( %s USING $charset ) USING $connection_charset )", $value['value'] );
}
unset( $data[ $col ]['db'] );
}
}
$sql = array();
foreach ( $queries as $column => $query ) {
if ( ! $query ) {
continue;
}
$sql[] = $query . " AS x_$column";
}
$this->check_current_query = false;
$row = $this->get_row( 'SELECT ' . implode( ', ', $sql ), ARRAY_A );
if ( ! $row ) {
return new WP_Error( 'wpdb_strip_invalid_text_failure' );
}
foreach ( array_keys( $data ) as $column ) {
if ( isset( $row[ "x_$column" ] ) ) {
$data[ $column ]['value'] = $row[ "x_$column" ];
}
}
}
return $data;
}