|
|
@ -1,11 +1,11 @@ |
|
|
|
use std::simd::Simd;
|
|
|
|
|
|
|
|
pub fn sum_f32_vector_simd(a: &[f32]) -> f32 {
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = a.len() / chunk_size;
|
|
|
|
let mut sum = Simd::<f32, 4>::splat(0.0);
|
|
|
|
let mut sum = Simd::<f32, 16>::splat(0.0);
|
|
|
|
for i in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&a[i * chunk_size..]);
|
|
|
|
sum += a;
|
|
|
|
}
|
|
|
|
let mut result = sum.as_array().iter().sum::<f32>();
|
|
|
@ -21,11 +21,11 @@ pub fn add_f32_vector_simd(a: &[f32], b: &[f32]) -> Vec<f32> { |
|
|
|
if a.is_empty() {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = a.len() / chunk_size;
|
|
|
|
for i in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 4>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 16>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let c = a + b;
|
|
|
|
result.extend_from_slice(c.as_array());
|
|
|
|
}
|
|
|
@ -41,11 +41,11 @@ pub fn sub_f32_vector_simd(a: &[f32], b: &[f32]) -> Vec<f32> { |
|
|
|
if a.is_empty() {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = a.len() / chunk_size;
|
|
|
|
for i in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 4>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 16>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let c = a - b;
|
|
|
|
result.extend_from_slice(c.as_array());
|
|
|
|
}
|
|
|
@ -61,11 +61,11 @@ pub fn mul_f32_tensor_simd(a: &[f32], b: &[f32]) -> Vec<f32> { |
|
|
|
if a.is_empty() {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = a.len() / chunk_size;
|
|
|
|
for i in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 4>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 16>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let c = a * b;
|
|
|
|
result.extend_from_slice(c.as_array());
|
|
|
|
}
|
|
|
@ -81,11 +81,11 @@ pub fn div_f32_tensor_simd(a: &[f32], b: &[f32]) -> Vec<f32> { |
|
|
|
if a.is_empty() {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = a.len() / chunk_size;
|
|
|
|
for i in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 4>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&a[i * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 16>::from_slice(&b[i * chunk_size..]);
|
|
|
|
let c = a / b;
|
|
|
|
result.extend_from_slice(c.as_array());
|
|
|
|
}
|
|
|
@ -110,7 +110,7 @@ pub fn matmul_f32_tensor_simd( |
|
|
|
let n = b_shape[1];
|
|
|
|
let k = a_shape[1];
|
|
|
|
|
|
|
|
let chunk_size = 4;
|
|
|
|
let chunk_size = 16;
|
|
|
|
let chunks = k / chunk_size;
|
|
|
|
|
|
|
|
let mut result = vec![0.0; m * n];
|
|
|
@ -121,10 +121,10 @@ pub fn matmul_f32_tensor_simd( |
|
|
|
}
|
|
|
|
for i in 0..m {
|
|
|
|
let arow = &a[i * k..(i + 1) * k];
|
|
|
|
let mut sum = Simd::<f32, 4>::splat(0.0);
|
|
|
|
let mut sum = Simd::<f32, 16>::splat(0.0);
|
|
|
|
for chunk in 0..chunks {
|
|
|
|
let a = Simd::<f32, 4>::from_slice(&arow[chunk * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 4>::from_slice(&bcol[chunk * chunk_size..]);
|
|
|
|
let a = Simd::<f32, 16>::from_slice(&arow[chunk * chunk_size..]);
|
|
|
|
let b = Simd::<f32, 16>::from_slice(&bcol[chunk * chunk_size..]);
|
|
|
|
sum += a * b;
|
|
|
|
}
|
|
|
|
let mut res = sum.as_array().iter().sum::<f32>();
|
|
|
|