Why/How is Circshift so fast?
I have the following function in Matlab:
function [RawDataKK]=DataCompressKKTest(Data,RXangle,s)
tSize=size(Data,1);
xSize=size(Data,2);
TXSize=size(Data,3);
RXSize=numel(RXangle);
RawDataKK=zeros(tSize,TXSize,RXSize);
DataTemp=zeros(tSize,xSize);
slope = s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
for nR=1:RXSize
    for nT=1:TXSize
        for nx=1:xSize
            DataTemp(:,nx)=circshift(Data(:,nx,nT),nShift(nR,nx));
        end
        RawDataKK(:,nT,nR)=sum(DataTemp,2);
    end
end
end
I tried implementing it as a C++ MEX function using the Eigen library and the Matlab Data API for C++. The basic logic of the C++ implementation is to replace circshift with a reindexing approach driven by precomputed indices.
``` lang-cpp
#include <iostream>
#include "mex.hpp"
#include "mexAdapter.hpp"
#include <Eigen/Dense>
#include <MatlabDataArray.hpp>
#include <cmath>
#include <math.h>
#include <omp.h>
class MexFunction : public matlab::mex::Function {
public:
    // Factory to create MATLAB data arrays
    matlab::data::ArrayFactory factory;

    void operator()(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
        checkArguments(outputs, inputs);
        // Initialize input parameters
        int tSize = inputs[0][0];
        int xSize = inputs[0][1];
        int TXSize = inputs[0][2];
        int RXSize = inputs[0][3];
        int RFlen = inputs[0][4];
        auto ptr = getDataPtr<std::complex<double>>(inputs[1]);
        Eigen::Map<const Eigen::MatrixXcd> Data(ptr, RFlen, xSize);
        double s = inputs[2][0];
        auto ptr2 = getDataPtr<double>(inputs[3]);
        Eigen::Map<const Eigen::VectorXd> RXangle(ptr2, RXSize);
        auto ptr3 = getDataPtr<int32_t>(inputs[4]);
        Eigen::Map<const Eigen::MatrixXi> indices(ptr3, tSize, xSize*RXSize);
        outputs[0] = factory.createArray<std::complex<double>>({static_cast<size_t>(tSize), static_cast<size_t>(TXSize*RXSize)});
        auto ptrRecon = getOutDataPtr<std::complex<double>>(outputs[0]);
        Eigen::Map<Eigen::MatrixXcd> RFDataKK(ptrRecon, tSize, TXSize*RXSize);
        // Get num threads
        int numThreads = omp_get_max_threads();
        int nProc = omp_get_num_procs();
        omp_set_num_threads(nProc*2);
        #pragma omp parallel for
        for (int nR = 0; nR < RXSize; nR++) {
            Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize*TXSize, xSize);
            for (int nT = 0; nT < TXSize; nT++) {
                extractData(shiftedData(Eigen::seq(nT*tSize, (nT+1)*tSize-1), Eigen::all),
                            Data(Eigen::seq(nT*tSize, (nT+1)*tSize-1), Eigen::all),
                            indices(Eigen::all, Eigen::seq(nR*xSize, (nR+1)*xSize-1)), xSize, tSize);
            }
            RFDataKK(Eigen::all, Eigen::seq(nR*TXSize, (nR+1)*TXSize-1)) = shiftedData.rowwise().sum().reshaped(tSize, TXSize);
        }
    }

    void extractData(Eigen::Ref<Eigen::MatrixXcd> shiftedData, const Eigen::Ref<const Eigen::MatrixXcd>& Data,
                     const Eigen::Ref<const Eigen::MatrixXi>& indices, const long int& xSize, const long int& tSize) {
        for (int i = 0; i < tSize*xSize; i++) {
            int nt = i % tSize;
            int nx = i / tSize;
            int row = indices(nt,nx) % tSize;
            int col = indices(nt,nx) / tSize;
            shiftedData(row,col) = Data(nt,nx);
        }
    }

    void checkArguments(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
        std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
        matlab::data::ArrayFactory factory;
        // TODO: Implement remaining checks
    }

    template <typename T>
    const T* getDataPtr(matlab::data::Array arr) {
        const matlab::data::TypedArray<T> arr_t = arr;
        matlab::data::TypedIterator<const T> it(arr_t.begin());
        return it.operator->();
    }

    template <typename T>
    T* getOutDataPtr(matlab::data::Array& arr) {
        auto range = matlab::data::getWritableElements<T>(arr);
        return range.begin().operator->();
    }
};
```
Here, indices is precomputed using the following Matlab function:
function linIndices=DataCompressKKIndices(RXangle,s,tSize,xSize,TXSize,RXSize)
slope=s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
indices = mod((0:tSize-1).' + reshape(nShift.',[],1).',tSize);
linIndices = indices + repmat((0:xSize-1)*tSize,1,RXSize);
end
When I run a speed test in Matlab after compiling with the parameters below (MinGW GCC 8.3), I find that I have not gained a meaningful speedup for the array sizes I am working with, even though the OpenMP parallelization alone should give at least an order-of-magnitude speedup (I have 24 threads on my machine). This raises two questions:
1. Why is a reindexing approach slower than doing circshift? A reindexing approach in Matlab (not shown here) is almost 2x slower than using nested for loops and circshift.
2. What exactly could circshift be doing under the hood that is so efficient? Is there some funky pointer arithmetic that could accomplish the functionality of circshift? (A sketch of the kind of thing I have in mind is shown below.)
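For context, the sort of thing I imagine circshift might be doing internally (this is only my guess, not MATLAB's actual implementation) is realizing the shift of each column as two contiguous block copies rather than per-element indexed writes. A minimal standalone sketch (circshift_column is my own illustrative helper, not any library API):
``` lang-cpp
#include <algorithm>
#include <complex>
#include <cstddef>
#include <vector>

// Circularly shift one column of length n down by k (0 <= k < n) using two
// contiguous block copies. src and dst must not overlap.
// Sketch of the block-copy idea only; not MATLAB's actual circshift source.
void circshift_column(const std::complex<double>* src,
                      std::complex<double>* dst,
                      std::ptrdiff_t n, std::ptrdiff_t k)
{
    // Elements src[0 .. n-k-1] move down to dst[k .. n-1]
    std::copy(src, src + (n - k), dst + k);
    // Elements src[n-k .. n-1] wrap around to dst[0 .. k-1]
    std::copy(src + (n - k), src + n, dst);
}

int main()
{
    const std::ptrdiff_t n = 8;
    std::vector<std::complex<double>> col(n), shifted(n);
    for (std::ptrdiff_t i = 0; i < n; ++i) col[i] = {static_cast<double>(i), 0.0};

    std::ptrdiff_t k = 3;      // shift amount, may be negative like nShift
    k = ((k % n) + n) % n;     // normalize to [0, n)
    circshift_column(col.data(), shifted.data(), n, k);
    // shifted now holds {5,6,7,0,1,2,3,4}, matching circshift(col,3) in Matlab
}
```
Both copies stream through memory sequentially, so (if my guess is right) each column shift is essentially a pair of memcpy-like operations, whereas my extractData loop scatters its writes across the rows of shiftedData.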
Compilation and testing code:
%% Compilation
mingwFlags = {'CXXFLAGS="$CXXFLAGS -march=native -std=c++14 -fno-math-errno -ffast-math -fopenmp -DNDEBUG -w -Wno-error"',...
    'LDFLAGS="$LDFLAGS -fopenmp"','CXXOPTIMFLAGS="-O3"'};
% ipath is the path to the eigen library.
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKK.cpp'); toc
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKKIndices.cpp'); toc
%% Initialize some random data
…
% tSize = 2688, xSize = 192, TXSize = 15, RXSize = 16 for trial cases
%% Timing Test
nTrials = 100;
T = zeros(nTrials,4);
for i = 1:nTrials
tic; indices = DataCompressKKIndices(p.RXangle,s,double(p.szRFframe+1),double(p.numEl),p.na,p.nRX);
RawDataKK3 = CompressKKIndices(param,cRF(1:param(5),p.ConnMap),s,p.RXangle,int32(indices));
RawDataKK3 = reshape(RawDataKK3,[p.szRFframe+1,p.na,p.nRX]); T(i,1) = toc;
tic; RawDataKK=DataCompressKKTest(Data,p.RXangle,s); T(i,2) = toc;
end
chkT = mean(T,1)
Timing Results:
chkT =
    0.4867    0.8299
The first value is the C++ MEX version (T(:,1)); the second is the original Matlab circshift version (T(:,2)). That is less than a 2x improvement.