tesseract 4.1.1
Loading...
Searching...
No Matches
dotproductsse.cpp
Go to the documentation of this file.
1
2// File: dotproductsse.cpp
3// Description: Architecture-specific dot-product function.
4// Author: Ray Smith
5//
6// (C) Copyright 2015, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
17
18#if !defined(__SSE4_1__)
19#error Implementation only for SSE 4.1 capable architectures
20#endif
21
22#include <emmintrin.h>
23#include <smmintrin.h>
24#include <cstdint>
25#include "dotproduct.h"
26
27namespace tesseract {
28
// Returns the dot product of the first n elements of u and v.
// Products are accumulated two at a time in a 128-bit SSE register; an element
// left over when n is odd is added with scalar arithmetic. When n < 2 no
// vector load is issued and only the scalar tail runs.
double DotProductSSE(const double* u, const double* v, int n) {
  // Highest index at which a complete pair of doubles can still be read.
  const int last_pair = n - 2;
  // Index of the first element that has not been folded into the result yet.
  int pos = 0;
  // Two running partial sums, one per lane.
  __m128d acc = _mm_setzero_pd();
  if (last_pair >= 0) {
    // Aligned loads are reputedly faster but need 16 byte aligned addresses,
    // so they are used only when both inputs qualify. The index advances in
    // steps of two doubles (16 bytes), which keeps later loads aligned too.
    const std::uintptr_t address_bits = reinterpret_cast<std::uintptr_t>(u) |
                                        reinterpret_cast<std::uintptr_t>(v);
    if ((address_bits & 15) == 0) {
      // The first pair of products initialises the accumulator directly.
      acc = _mm_mul_pd(_mm_load_pd(u), _mm_load_pd(v));
      for (pos = 2; pos <= last_pair; pos += 2) {
        const __m128d products =
            _mm_mul_pd(_mm_load_pd(u + pos), _mm_load_pd(v + pos));
        acc = _mm_add_pd(acc, products);
      }
    } else {
      // Same computation using loads that tolerate any address.
      acc = _mm_mul_pd(_mm_loadu_pd(u), _mm_loadu_pd(v));
      for (pos = 2; pos <= last_pair; pos += 2) {
        const __m128d products =
            _mm_mul_pd(_mm_loadu_pd(u + pos), _mm_loadu_pd(v + pos));
        acc = _mm_add_pd(acc, products);
      }
    }
  }
  // Collapse the two lanes: add the high lane onto the low lane and read the
  // low lane back as a scalar.
  const __m128d high_lane = _mm_unpackhi_pd(acc, acc);
  double total = _mm_cvtsd_f64(_mm_add_sd(acc, high_lane));
  // Add on the product that did not fit into a pair, if any.
  for (; pos < n; ++pos) {
    total += u[pos] * v[pos];
  }
  return total;
}
80
81} // namespace tesseract.
double DotProductSSE(const double *u, const double *v, int n)