Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions tensorflow/lite/kernels/internal/reference/concatenation.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,78 @@ inline void Concatenation(const ConcatenationParams& params,
}
}

template <>
inline void Concatenation<Int4>(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const Int4* const* input_data,
const RuntimeShape& output_shape,
Int4* output_data) {
int axis = params.axis;
int inputs_count = params.inputs_count;
const int concat_dimensions = output_shape.DimensionsCount();
TFLITE_DCHECK_LT(axis, concat_dimensions);

int64_t concat_size = 0;
for (int i = 0; i < inputs_count; i++) {
TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
for (int j = 0; j < concat_dimensions; j++) {
if (j != axis) {
MatchingDim(*input_shapes[i], j, output_shape, j);
}
}
concat_size += input_shapes[i]->Dims(axis);
}
TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
int64_t outer_size = 1;
for (int i = 0; i < axis; ++i) {
outer_size *= output_shape.Dims(i);
}
// For all input arrays,
// FlatSize() = outer_size * Dims(axis) * base_inner_size;
int64_t base_inner_size = 1;
for (int i = axis + 1; i < concat_dimensions; ++i) {
base_inner_size *= output_shape.Dims(i);
}

uint8_t* output_ptr = reinterpret_cast<uint8_t*>(output_data);
// We can't guarantee that the output buffer is initialized to 0, so we have
// to clear it to ensure the high/low nibbles not currently being written are
// not garbage.
// Note: output_shape.FlatSize() gives number of elements (nibbles).
// Bytes needed: (elements + 1) / 2.
memset(output_ptr, 0, (output_shape.FlatSize() + 1) / 2);

int64_t output_offset = 0;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int64_t copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const uint8_t* input_ptr =
reinterpret_cast<const uint8_t*>(input_data[i]);
// The input_ptr points to the start of the tensor data.
// We need to calculate the offset for the current outer loop iteration
// 'k'.
// The tensor has total elements = outer_size * copy_size.
// So current offset in elements is k * copy_size.
int64_t input_offset = k * copy_size;

for (int j = 0; j < copy_size; ++j) {
int64_t in_idx = input_offset + j;
uint8_t val = input_ptr[in_idx / 2];
uint8_t nibble = (in_idx % 2 == 0) ? (val & 0x0F) : ((val >> 4) & 0x0F);

int64_t out_idx = output_offset + j;
uint8_t* out_byte = output_ptr + (out_idx / 2);
if (out_idx % 2 == 0) {
*out_byte = (*out_byte & 0xF0) | nibble;
} else {
*out_byte = (*out_byte & 0x0F) | (nibble << 4);
}
}
output_offset += copy_size;
}
}
}

// TODO(b/174275780): The quantized implementation of concatentation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
// when optimizng this routine further.
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/kernels/internal/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ enum class FusedActivationFunctionType : uint8_t {
};
enum class PaddingType : uint8_t { kNone, kSame, kValid };

// Empty struct to allow using a distinct type for Int4.
struct Int4 {
uint8_t val;
};

struct PaddingValues {
int16_t width;
int16_t height;
Expand Down