tesseract 4.1.1
Loading...
Searching...
No Matches
intsimdmatrix.h
Go to the documentation of this file.
1
2// File: intsimdmatrix.h
3// Description: Base class for 8-bit int SIMD matrix multipliers.
4// Author: Ray Smith
5// Created: Tue Aug 15 07:37:20 PST 2017
6//
7// (C) Copyright 2017, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
18
19#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
20#define TESSERACT_ARCH_INTSIMDMATRIX_H_
21
22#include <cstdint>
23#include <vector>
24
25template <class T>
26class GENERIC_2D_ARRAY;
27template <typename T>
28class GenericVector;
29
30namespace tesseract {
31
32// Base class for a SIMD function to multiply a matrix by a vector, with sources
33// of 8-bit signed integer, and result in a double, after appropriate scaling.
34// Assumes a specific method of multiplication that can be applied to any size
35// and number of SIMD registers as follows:
36// int32_t results are computed with num_outputs_per_register_ in each of
37// max_output_registers_ result registers, repeatedly until it would make too
38// many results, then the number of registers is halved, and so on down to a
39// single result register. The last calculation only outputs the required number
40// of results instead of writing beyond the bounds. E.g.: matrix has 75 outputs,
41// num_outputs_per_register_ = 4, and max_output_registers_ = 8,
42// Step 1: 8x4=32 results are computed,
43// Step 2: 8x4=32 again, total 64,
44// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
45// Step 4: 1x3, total 75.
46// Each step above is computed using a PartialFunc, which runs over the input
47// vector once. The input is read one registerful of num_inputs_per_register_
48// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
49// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
50// Since it is slow (on Intel at least) to horizontally add in a register,
51// provision is made to process num_inputs_per_group_ inputs at a time, with
52// the group being replicated num_input_groups_ times and multiplied by a
53// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
54// This is most convenient if num_inputs_per_group_ is 4, and the product
55// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
56// results in the process, but it doesn't have to be implemented that way.
57// The weights are re-ordered by Init() to be used sequentially by the above
58// algorithm, followed by the biases, so they can be added at the end.
59// The base class computes the base C++ implementation.
60// NOTE that, although the subclasses execute on different SIMD hardware, no
61// virtual methods are needed, as the constructor sets up everything that
62// is required to allow the base class implementation to do all the work.
63struct IntSimdMatrix {
64 // Computes a reshaped copy of the weight matrix w.
65 void Init(const GENERIC_2D_ARRAY<int8_t>& w,
66 std::vector<int8_t>& shaped_w) const;
67
68 // Rounds the size up to a multiple of the input register size (in int8_t).
69 int RoundInputs(int size) const {
70 return Roundup(size, num_inputs_per_register_);
71 }
72 // Rounds the size up to a multiple of the output register size (in int32_t).
73 int RoundOutputs(int size) const {
74 return Roundup(size, num_outputs_per_register_);
75 }
76
77 // Computes matrix.vector v = Wu.
78 // u is of size W.dim2() - 1 and the output v is of size W.dim1().
79 // u is imagined to have an extra element at the end with value 1, to
80 // implement the bias, but it doesn't actually have it.
81 // Computes the base C++ implementation.
82 static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
83 const GenericVector<double>& scales,
84 const int8_t* u, double* v);
85
86 // Rounds the input up to a multiple of the given factor.
87 static int Roundup(int input, int factor) {
88 return (input + factor - 1) / factor * factor;
89 }
90
91 // Computes matrix.vector v = Wu.
92 // u is of size W.dim2() - 1 and the output v is of size W.dim1().
93 // u is imagined to have an extra element at the end with value 1, to
94 // implement the bias, but it doesn't actually have it.
95 // Uses an optimized implementation with partial funcs.
96 // NOTE: The size of the input vector (u) must be padded using
97 // RoundInputs above.
98 // The input will be over-read to the extent of the padding. There are no
99 // alignment requirements.
100 using MatrixDotVectorFunction = void (*)(int, int, const int8_t*,
101 const double*, const int8_t*,
102 double*);
103 MatrixDotVectorFunction matrixDotVectorFunction;
104
105 // Number of 32 bit outputs held in each register.
106 int num_outputs_per_register_;
107 // Maximum number of registers that we will use to hold outputs.
108 int max_output_registers_;
109 // Number of 8 bit inputs in the inputs register.
110 int num_inputs_per_register_;
111 // Number of inputs in each weight group.
112 int num_inputs_per_group_;
113 // Number of groups of inputs to be broadcast.
114 // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_
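115
116 static const IntSimdMatrix* intSimdMatrix;
117 static const IntSimdMatrix intSimdMatrixAVX2;
118 static const IntSimdMatrix intSimdMatrixSSE;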
119};
120
121} // namespace tesseract
122
123#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
void(*)(int, int, const int8_t *, const double *, const int8_t *, double *) MatrixDotVectorFunction
static const IntSimdMatrix intSimdMatrixAVX2
static void MatrixDotVector(const GENERIC_2D_ARRAY< int8_t > &w, const GenericVector< double > &scales, const int8_t *u, double *v)
int RoundOutputs(int size) const
Definition: intsimdmatrix.h:73
int RoundInputs(int size) const
Definition: intsimdmatrix.h:69
MatrixDotVectorFunction matrixDotVectorFunction
static int Roundup(int input, int factor)
Definition: intsimdmatrix.h:87
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
void Init(const GENERIC_2D_ARRAY< int8_t > &w, std::vector< int8_t > &shaped_w) const