Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions be/src/core/column/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "util/simd/bits.h"
#include "util/simd/vstring_function.h"
#include "util/unaligned.h"
#include "util/utf8_check.h"
namespace doris {

template <typename T>
Expand Down Expand Up @@ -759,6 +760,11 @@ bool ColumnStr<T>::is_ascii() const {
return simd::VStringFunctions::is_ascii(StringRef(chars.data(), chars.size()));
}

/// Returns true only if every row of the column is, on its own, valid UTF-8.
///
/// Row payloads are stored back to back in `chars` with no separator, so checking
/// the whole buffer in a single call is not equivalent to checking each row: the
/// rows "\xE4" and "\xB8\x96" are each malformed, yet their concatenation is the
/// well-formed encoding of U+4E16. Each row is therefore validated independently,
/// using the cumulative end offsets stored in `offsets`.
///
/// An empty column and empty rows are reported as valid.
template <typename T>
bool ColumnStr<T>::is_valid_utf8() const {
    const auto* base = reinterpret_cast<const char*>(chars.data());
    // NOTE(review): assumes the first row starts at byte 0 of `chars` and that
    // offsets[i] is the end offset of row i, matching how row sizes are derived
    // elsewhere (offsets[i] - offsets[i - 1]) -- confirm against the class header.
    size_t row_begin = 0;
    for (const auto end_offset : offsets) {
        const auto row_end = static_cast<size_t>(end_offset);
        // An empty row is trivially valid, so only non-empty rows are checked.
        if (row_end != row_begin && !validate_utf8(base + row_begin, row_end - row_begin)) {
            return false;
        }
        row_begin = row_end;
    }
    return true;
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`ColumnStr` stores row payloads contiguously without any delimiter, so UTF-8 validity is not compositional here. Validating the whole `chars` buffer can return `true` even when an individual row is invalid. A concrete case is rows "\xE4" and "\xB8\x96": each row is invalid by itself, but the concatenated buffer forms the valid UTF-8 sequence for `世` (U+4E16), so this helper would incorrectly accept the column. Since this API is introduced as column-level UTF-8 validation, it needs to walk and validate each row independently.


// Explicit instantiations of ColumnStr for the uint32_t and uint64_t template
// arguments, so the member definitions in this translation unit are emitted for both.
template class ColumnStr<uint32_t>;
template class ColumnStr<uint64_t>;
} // namespace doris
4 changes: 4 additions & 0 deletions be/src/core/column/column_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ class Arena;
class ColumnSorter;

/** Column for String values.
* Note: In string functions, we assume that ColumnStr contains valid UTF-8 encoded data.
* However, ColumnStr is not guaranteed to always hold valid UTF-8, since it is also used
* as a serialization container where the content may be arbitrary binary data.
*/
template <typename T>
class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
Expand Down Expand Up @@ -536,6 +539,7 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
}

bool is_ascii() const;
bool is_valid_utf8() const;

Chars& get_chars() { return chars; }
const Chars& get_chars() const { return chars; }
Expand Down
27 changes: 27 additions & 0 deletions be/src/exprs/function/function_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "exprs/function/string_hex_util.h"
#include "util/string_search.hpp"
#include "util/url_coding.h"
#include "util/utf8_check.h"

namespace doris {
struct NameStringASCII {
Expand Down Expand Up @@ -225,6 +226,29 @@ struct StringUtf8LengthImpl {
}
};

/// Name tag for the UTF-8 validity function; passed as the name parameter of
/// FunctionUnaryToType when FunctionIsValidUTF8 is defined.
struct NameIsValidUTF8 {
    static constexpr const char* name = "is_valid_utf8";
};

/// Row-wise kernel behind is_valid_utf8: for each row, stores 1 in the result when
/// validate_utf8 accepts the row's bytes and 0 otherwise.
struct IsValidUTF8Impl {
    using ReturnType = DataTypeUInt8;
    static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING;
    using Type = String;
    using ReturnColumnType = ColumnUInt8;

    /// @param data    byte storage shared by all rows
    /// @param offsets one entry per row; a row's length is offsets[i] - offsets[i - 1]
    /// @param res     resized to offsets.size() and filled with 0/1 flags
    /// @return Status::OK() in all cases
    static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                         PaddedPODArray<UInt8>& res) {
        const auto row_count = offsets.size();
        res.resize(row_count);
        const auto* bytes = reinterpret_cast<const char*>(data.data());
        for (size_t row = 0; row < row_count; ++row) {
            // NOTE(review): for row 0 this reads offsets[-1]; presumably the offsets
            // container guarantees a readable zero there -- confirm.
            const auto row_start = offsets[row - 1];
            const size_t row_length = offsets[row] - row_start;
            const bool well_formed = validate_utf8(bytes + row_start, row_length);
            res[row] = static_cast<UInt8>(well_formed);
        }
        return Status::OK();
    }
};

struct NameStartsWith {
static constexpr auto name = "starts_with";
};
Expand Down Expand Up @@ -1316,6 +1340,7 @@ using FunctionStringLength = FunctionUnaryToType<StringLengthImpl, NameStringLen
using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl, NameStringUtf8Length>;
using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>;
using FunctionIsValidUTF8 = FunctionUnaryToType<IsValidUTF8Impl, NameIsValidUTF8>;
using FunctionStringStartsWith =
FunctionBinaryToType<DataTypeString, DataTypeString, StringStartsWithImpl, NameStartsWith>;
using FunctionStringEndsWith =
Expand Down Expand Up @@ -1422,7 +1447,9 @@ void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>();
factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
factory.register_function<FunctionOverlay>();
factory.register_function<FunctionIsValidUTF8>();

factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8");
factory.register_alias(FunctionToLower::name, "lcase");
factory.register_alias(FunctionToUpper::name, "ucase");
factory.register_alias(FunctionStringUTF8Length::name, "character_length");
Expand Down
75 changes: 75 additions & 0 deletions be/test/core/column/column_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1425,4 +1425,79 @@ TEST_F(ColumnStringTest, is_ascii) {
}
}

// Checks ColumnString::is_valid_utf8() against well-formed and malformed inputs.
TEST_F(ColumnStringTest, is_valid_utf8) {
    // Builds a one-row column from the given raw bytes and reports its validity.
    const auto single_row_is_valid = [](const char* bytes, size_t length) {
        auto col = ColumnString::create();
        col->insert_data(bytes, length);
        return col->is_valid_utf8();
    };

    // Well-formed: every row is plain ASCII.
    {
        auto ascii = ColumnString::create();
        const char* const words[] = {"hello", "world", "123!@#"};
        for (const char* word : words) {
            ascii->insert_data(word, strlen(word));
        }
        EXPECT_TRUE(ascii->is_valid_utf8());
    }
    // Well-formed: a column with no rows at all.
    {
        auto no_rows = ColumnString::create();
        EXPECT_TRUE(no_rows->is_valid_utf8());
    }
    // Well-formed: rows that are empty strings.
    {
        auto blanks = ColumnString::create();
        blanks->insert_data("", 0);
        blanks->insert_data("", 0);
        EXPECT_TRUE(blanks->is_valid_utf8());
    }
    // Well-formed: multi-byte characters (2-, 3- and 4-byte sequences).
    {
        auto multibyte = ColumnString::create();
        const char* const samples[] = {"Hello, 世界", "こんにちは", "😀"};
        for (const char* sample : samples) {
            multibyte->insert_data(sample, strlen(sample));
        }
        EXPECT_TRUE(multibyte->is_valid_utf8());
    }
    // Malformed: lone continuation byte 0x80.
    {
        const char lone_continuation[] = {'\x80'};
        EXPECT_FALSE(single_row_is_valid(lone_continuation, sizeof(lone_continuation)));
    }
    // Malformed: 2-byte lead 0xC3 followed by a non-continuation byte 0x28.
    {
        const char bad_two_byte[] = {'\xc3', '\x28'};
        EXPECT_FALSE(single_row_is_valid(bad_two_byte, sizeof(bad_two_byte)));
    }
    // Malformed: overlong encoding 0xC0 0xAF.
    {
        const char overlong[] = {'\xc0', '\xaf'};
        EXPECT_FALSE(single_row_is_valid(overlong, sizeof(overlong)));
    }
    // Malformed: byte 0xFE.
    {
        const char byte_fe[] = {'\xfe'};
        EXPECT_FALSE(single_row_is_valid(byte_fe, sizeof(byte_fe)));
    }
    // Malformed: 3-byte sequence 0xE4 0xB8 cut off before its last byte.
    {
        const char truncated[] = {'\xe4', '\xb8'};
        EXPECT_FALSE(single_row_is_valid(truncated, sizeof(truncated)));
    }
    // Mixed: a single bad row between two good ones invalidates the column.
    {
        auto mixed = ColumnString::create();
        const char invalid_byte[] = {'\xff'};
        mixed->insert_data("hello", 5);
        mixed->insert_data(invalid_byte, sizeof(invalid_byte));
        mixed->insert_data("world", 5);
        EXPECT_FALSE(mixed->is_valid_utf8());
    }
}

} // namespace doris
30 changes: 30 additions & 0 deletions be/test/exprs/function/function_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) {
check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types, data_set);
}

// Table-driven test for the "is_valid_utf8" function: each row pairs one input
// string with the expected UInt8 result (1 = valid, 0 = invalid), plus a NULL case.
TEST(function_string_test, function_is_valid_utf8_test) {
std::string func_name = "is_valid_utf8";

// Single VARCHAR argument.
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};

DataSet data_set = {
// valid UTF-8 strings
{{std::string("hello")}, std::uint8_t(1)},
{{std::string("")}, std::uint8_t(1)},
{{std::string("Hello, 世界")}, std::uint8_t(1)},
{{std::string("こんにちは")}, std::uint8_t(1)},
{{std::string("123!@#")}, std::uint8_t(1)},
{{std::string("\xc3\xb1")}, std::uint8_t(1)}, // ñ
{{std::string("\xe2\x82\xac")}, std::uint8_t(1)}, // €
{{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀
// invalid UTF-8 strings
{{std::string("\x80")}, std::uint8_t(0)}, // lone continuation byte, cannot start a sequence
{{std::string("\xc3\x28")}, std::uint8_t(0)}, // invalid 2-byte sequence
{{std::string("\xe2\x28\xa1")}, std::uint8_t(0)}, // invalid 3-byte sequence
{{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid 4-byte sequence
{{std::string("\xfe")}, std::uint8_t(0)}, // invalid byte 0xFE
{{std::string("\xff")}, std::uint8_t(0)}, // invalid byte 0xFF
{{std::string("abc\xc0\xaf")}, std::uint8_t(0)}, // overlong encoding after a valid ASCII prefix
// NULL input is expected to produce NULL
{{Null()}, Null()},
};

check_function_all_arg_comb<DataTypeUInt8, true>(func_name, input_types, data_set);
}

TEST(function_string_test, function_char_length_test) {
std::string func_name = "char_length";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
Expand Down Expand Up @@ -837,6 +838,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(IsIpAddressInRange.class, "is_ip_address_in_range"),
scalar(IsNan.class, "isnan"),
scalar(IsUuid.class, "is_uuid"),
scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"),
scalar(IsInf.class, "isinf"),
scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"),
scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BooleanType;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Scalar function {@code is_valid_utf8}.
 *
 * <p>Takes exactly one argument and declares two signatures, both returning
 * {@link BooleanType}: one accepting {@link VarcharType#SYSTEM_DEFAULT} and one
 * accepting {@link StringType}.
 */
public class IsValidUtf8 extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** Supported signatures, in declaration order: VARCHAR first, then STRING. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
            FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE)
    );

    /**
     * Creates the function over a single argument expression.
     *
     * @param arg the expression whose value is checked
     */
    public IsValidUtf8(Expression arg) {
        super("is_valid_utf8", arg);
    }

    /** Rebuilds the function from existing parameters; used by {@link #withChildren}. */
    private IsValidUtf8(ScalarFunctionParams functionParams) {
        super(functionParams);
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitIsValidUtf8(this, context);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Returns a copy of this function over the given children.
     *
     * @param children replacement children; must contain exactly one expression
     * @throws IllegalArgumentException if {@code children} does not have exactly one element
     */
    @Override
    public IsValidUtf8 withChildren(List<Expression> children) {
        Preconditions.checkArgument(children.size() == 1);
        ScalarFunctionParams rebuiltParams = getFunctionParams(children);
        return new IsValidUtf8(rebuiltParams);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
Expand Down Expand Up @@ -1701,6 +1702,10 @@ default R visitIsUuid(IsUuid isUuid, C context) {
return visitScalarFunction(isUuid, context);
}

/**
 * Visits an IsValidUtf8 expression. The default implementation forwards the node
 * and context unchanged to visitScalarFunction.
 */
default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) {
return visitScalarFunction(isValidUtf8, context);
}

default R visitIsInf(IsInf isInf, C context) {
return visitScalarFunction(isInf, context);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !valid_1 --
true

-- !valid_2 --
true

-- !valid_3 --
true

-- !valid_4 --
true

-- !valid_5 --
true

-- !null_1 --
\N

-- !invalid_1 --
false

-- !invalid_2 --
false

-- !invalid_3 --
false

-- !invalid_4 --
false

-- !invalid_5 --
false

-- !invalid_6 --
false

-- !invalid_7 --
false

-- !invalid_8 --
false

-- !alias_1 --
true

-- !alias_2 --
true

-- !alias_3 --
false

-- !table_1 --
1 true
2 true
3 true
4 \N
5 false
6 false

Loading
Loading