Dlaczego wskaźniki mają większą wydajność? Test wydajności funkcji openCV.

1

Ostatnio zrobiłem test szybkości funkcji, które wykonują obliczenia na każdym pikselu zdjęcia. Tutaj git.
Kod:

#include <cstdlib>
#include <iostream>
#include <string>

#include <opencv2/opencv.hpp>

#include "TimerBase.h"

using std::string;
using namespace cv;

// window title
const string window_title = "Window";

/**
 * Negates the image in place by walking its buffer through the raw data pointer.
 *
 * Every channel value is read as a uchar and replaced with 255 minus itself.
 * Only continuous matrices are handled: for a non-continuous one a message is
 * printed to std::cout and the image is left untouched.
 *
 * @param img image to negate in place.
 */
void NegativeDirectMemoryAccess(Mat& img)
{
	if (!img.isContinuous())
	{
		std::cout << "Mat is not continous" << std::endl;
		return;
	}

	// Number of individual channel values stored in the buffer
	// (rows * cols pixels, each with channels() values).
	const int valueCount = img.rows * img.cols * img.channels();
	uchar* const buffer = img.data;

	// A continuous buffer can be treated as one flat array, so the per-pixel /
	// per-channel split is not needed to visit every value exactly once.
	for (int idx = 0; idx < valueCount; ++idx)
	{
		buffer[idx] = 255 - buffer[idx];
	}
}

/**
 * Negates the image in place using OpenCV's typed matrix iterators.
 *
 * Supports 1-channel and 3-channel images whose elements are accessed as
 * uchar / Vec<uchar, 3>. For any other channel count a message is printed to
 * std::cout and the image is left untouched.
 *
 * @param img image to negate in place.
 */
void NegativeIterator(Mat& img)
{
	switch (img.channels())
	{
	case 1:
	{
		auto cur = img.begin<uchar>();
		const auto last = img.end<uchar>();
		for (; cur != last; ++cur)
		{
			*cur = 255 - *cur;
		}
		break;
	}
	case 3:
	{
		typedef Vec<uchar, 3> Triple;

		auto cur = img.begin<Triple>();
		const auto last = img.end<Triple>();
		for (; cur != last; ++cur)
		{
			Triple& px = *cur;
			// Vec components can be reached either through the val array...
			px.val[0] = 255 - px.val[0];
			// ...or through operator[] on the Vec itself.
			px[1] = 255 - px[1];

			px.val[2] = 255 - px.val[2];
		}
		break;
	}
	default:
		std::cout << "Unsuported number of channels" << std::endl;
		break;
	}
}

/**
 * Negates the image in place, addressing every channel value through
 * Mat::at<>() with explicit (row, column) coordinates.
 *
 * Precondition: the image must be a colour image whose pixels can be accessed
 * as Vec<uchar, 3>; no check is performed here.
 *
 * @param img 3-channel image to negate in place.
 */
void NegativeRandomAccess(Mat& img)
{
	using Pixel = Vec<uchar, 3>;

	const int height = img.rows;
	const int width = img.cols;

	for (int row = 0; row < height; ++row)
	{
		for (int col = 0; col < width; ++col)
		{
			// Each channel is deliberately looked up via at<>() on both the
			// read and the write side, so the cost of random access stays
			// part of what this routine exercises.
			for (int channel = 0; channel < 3; ++channel)
			{
				img.at<Pixel>(row, col).val[channel] = 255 - img.at<Pixel>(row, col).val[channel];
			}
		}
	}
}

/**
 * Negates the image in place, row by row, using a raw pixel pointer obtained
 * from Mat::ptr<>() for the start of each row.
 *
 * Precondition: the image must be a colour image whose pixels can be accessed
 * as Vec<uchar, 3>; no check is performed here.
 *
 * @param img 3-channel image to negate in place.
 */
void NegativRawPtrAccess(Mat& img)
{
	using Pixel = Vec<uchar, 3>;

	const int rowCount = img.rows;
	for (int row = 0; row < rowCount; ++row)
	{
		// First pixel of this row and the one-past-the-last pixel of it.
		Pixel* px = img.ptr<Pixel>(row, 0);
		const Pixel* const rowEnd = px + img.cols;

		while (px != rowEnd)
		{
			for (int channel = 0; channel < 3; ++channel)
			{
				px->val[channel] = 255 - px->val[channel];
			}
			++px;
		}
	}
}

/**
 * Negates the image in place by iterating a typed Mat_<Vec<uchar, 3>> view of
 * it with a range-based for loop.
 *
 * The typed view is only useful when it refers to the very same buffer as
 * img. When Mat_ has to convert the input (its converting constructor does so
 * for a matrix of a different type), the view is a separate copy and negating
 * it would silently leave img unchanged. That case is detected and reported
 * on std::cout instead of being ignored.
 *
 * @param img image to negate in place; expected to hold Vec<uchar, 3> pixels.
 */
void NegativeMatIterator(Mat& img)
{
	typedef Vec<uchar, 3> Pixel;

	// Named local instead of a temporary so the aliasing check below can
	// inspect it before the loop runs.
	cv::Mat_<Pixel> view(img);

	// Different data pointers mean the constructor produced a converted copy;
	// writing to it would have no effect on the caller's image.
	if (view.data != img.data)
	{
		std::cout << "NegativeMatIterator: image had to be converted to Vec<uchar, 3>; original left unchanged" << std::endl;
		return;
	}

	for (Pixel& p : view)
	{
		p.val[0] = 255 - p.val[0];
		p.val[1] = 255 - p.val[1];
		p.val[2] = 255 - p.val[2];
	}
}

/**
 * Negates the image in place by handing a per-pixel functor to
 * Mat::forEach<>().
 *
 * Precondition: the image must be a colour image whose pixels can be accessed
 * as Vec<uchar, 3>; no check is performed here.
 *
 * @param img 3-channel image to negate in place.
 */
void NegativeLambda(Mat& img)
{
	using Pixel = Vec<uchar, 3>;

	// forEach passes the pixel and a pointer to its position; the position is
	// not needed for a plain negation, so it is left unnamed.
	const auto invert = [](Pixel& px, const int* /*position*/) -> void {
		for (int channel = 0; channel < 3; ++channel)
		{
			px.val[channel] = 255 - px.val[channel];
		}
	};

	img.forEach<Pixel>(invert);
}

/**
 * Measures the average duration of one call to func, in microseconds.
 *
 * func is invoked `calls` times, each time on a fresh clone of mat so that
 * every call starts from the same input. The clone is made inside the timed
 * region, so its cost is included in the reported average.
 *
 * @param mat   input image; it is never modified, only cloned.
 * @param func  routine to benchmark; receives the clone by reference.
 * @param calls number of timed invocations.
 * @return elapsed microseconds divided by calls, or 0 when there is nothing
 *         to measure (calls <= 0 or func is null).
 */
unsigned long check_time(cv::Mat mat, void (*func)(cv::Mat&), const int calls)
{
	// Without at least one call there is no meaningful average, and dividing
	// by zero below would be undefined behaviour.
	if (calls <= 0 || func == nullptr)
	{
		return 0;
	}

	cTimer timer;
	timer.start();

	for (int i = 0; i < calls; ++i)
	{
		cv::Mat m = mat.clone();
		func(m);
	}

	return timer.getTime(cTimer::timeUnit::Microseconds) / calls;
}

int main()
{ 
	cv::Mat img = cv::imread("lena.jpg");
	cTimer timer;

	const int funcCalls = 100;
	std::cout << "NegativeDirectMemoryAccess: " << check_time(img, NegativeDirectMemoryAccess, funcCalls) << "\n";
	std::cout << "NegativeIterator: " << check_time(img, NegativeIterator, funcCalls) << "\n";
	std::cout << "NegativeRandomAccess: " << check_time(img, NegativeRandomAccess, funcCalls) << "\n";
	std::cout << "NegativRawPtrAccess: " << check_time(img, NegativRawPtrAccess, funcCalls) << "\n";
	std::cout << "NegativeMatIterator: " << check_time(img, NegativeMatIterator, funcCalls) << "\n";
	std::cout << "NegativeLambda: " << check_time(img, NegativeLambda, funcCalls) << "\n";

	while (true) 
	{
		cv::namedWindow(window_title);
		cv::imshow(window_title, img);
		cv::waitKey();
	}
	
	std::cin.get();
	return 0;
}

No i w zasadzie mam pytania co do tych dwóch funkcji:
Dlaczego jest aż tak kolosalna różnica między nimi?
NegativRawPtrAccess: 947 microsec
NegativeMatIterator: 18680 microsec

0

Zapewne SIMD wskakuje w opcji NegativRawPtrAccess, co niespecjalnie jest możliwe w NegativeMatIterator.

0

Na moim MacBook Air

g++ main.cpp -o task2 -std=c++17 `pkg-config --cflags --libs opencv`

Bez optymalizacji:

NegativeDirectMemoryAccess: 2322
NegativeIterator: 7584
NegativeRandomAccess: 6584
NegativRawPtrAccess: 1012
NegativeMatIterator: 4455
NegativeLambda: 1450

z -O1

NegativeDirectMemoryAccess: 649
NegativeIterator: 5246
NegativeRandomAccess: 3113
NegativRawPtrAccess: 392
NegativeMatIterator: 2275
NegativeLambda: 927

z -O2

NegativeDirectMemoryAccess: 826
NegativeIterator: 941
NegativeRandomAccess: 963
NegativRawPtrAccess: 381
NegativeMatIterator: 933
NegativeLambda: 361

z -O3:

NegativeDirectMemoryAccess: 881
NegativeIterator: 963
NegativeRandomAccess: 1002
NegativRawPtrAccess: 422
NegativeMatIterator: 956
NegativeLambda: 700

Jak widać, najdramatyczniejsza różnica wychodzi dla -O1.
Ciekawe z czego to wynika.

1

-O2

	.globl	__Z19NegativRawPtrAccessRN2cv3MatE ## -- Begin function _Z19NegativRawPtrAccessRN2cv3MatE
	.p2align	4, 0x90
__Z19NegativRawPtrAccessRN2cv3MatE:     ## @_Z19NegativRawPtrAccessRN2cv3MatE
	.cfi_startproc
## %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	movl	8(%rdi), %ecx
	testl	%ecx, %ecx
	jle	LBB5_7
## %bb.1:
	xorl	%eax, %eax
	.p2align	4, 0x90
LBB5_2:                                 ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB5_4 Depth 2
	movslq	12(%rdi), %rdx
	leaq	(%rdx,%rdx,2), %rdx
	testq	%rdx, %rdx
	je	LBB5_6
## %bb.3:                               ##   in Loop: Header=BB5_2 Depth=1
	movq	72(%rdi), %rcx
	movq	(%rcx), %rcx
	imulq	%rax, %rcx
	addq	16(%rdi), %rcx
	addq	%rcx, %rdx
	.p2align	4, 0x90
LBB5_4:                                 ##   Parent Loop BB5_2 Depth=1
                                        ## =>  This Inner Loop Header: Depth=2
	notb	(%rcx)
	notb	1(%rcx)
	notb	2(%rcx)
	addq	$3, %rcx
	cmpq	%rdx, %rcx
	jne	LBB5_4
## %bb.5:                               ##   in Loop: Header=BB5_2 Depth=1
	movl	8(%rdi), %ecx
LBB5_6:                                 ##   in Loop: Header=BB5_2 Depth=1
	incq	%rax
	movslq	%ecx, %rdx
	cmpq	%rdx, %rax
	jl	LBB5_2
LBB5_7:
	popq	%rbp
	retq
	.cfi_endproc
                                        ## -- End function
	.globl	__Z19NegativeMatIteratorRN2cv3MatE ## -- Begin function _Z19NegativeMatIteratorRN2cv3MatE
	.p2align	4, 0x90
__Z19NegativeMatIteratorRN2cv3MatE:     ## @_Z19NegativeMatIteratorRN2cv3MatE
Lfunc_begin2:
	.cfi_startproc
	.cfi_personality 155, ___gxx_personality_v0
	.cfi_lsda 16, Lexception2
## %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%rbx
	subq	$184, %rsp
	.cfi_offset %rbx, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	leaq	-112(%rbp), %rax
	movl	$0, -60(%rbp)
	movq	$0, -68(%rbp)
	movq	$0, -76(%rbp)
	movq	$0, -84(%rbp)
	movq	$0, -92(%rbp)
	movq	$0, -100(%rbp)
	movq	$0, -108(%rbp)
	movq	$0, -116(%rbp)
	movq	%rax, -56(%rbp)
	leaq	-40(%rbp), %r14
	movq	%r14, -48(%rbp)
	movq	%rdi, %rax
	movq	$0, -32(%rbp)
	movq	$0, -40(%rbp)
	movl	$1124007952, -120(%rbp) ## imm = 0x42FF0010
Ltmp10:
	leaq	-120(%rbp), %r15
	movq	%r15, %rdi
	movq	%rax, %rsi
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEEaSERKNS_3MatE
Ltmp11:
## %bb.1:
	movq	%r15, -200(%rbp)
	movslq	-116(%rbp), %rcx
	testq	%rcx, %rcx
	jle	LBB6_2
## %bb.11:
	movq	-48(%rbp), %rax
	movq	-8(%rax,%rcx,8), %r8
	jmp	LBB6_12
LBB6_2:
	xorl	%r8d, %r8d
LBB6_12:
	movq	%r8, -192(%rbp)
	movq	$0, -168(%rbp)
	movq	$0, -176(%rbp)
	movq	$0, -184(%rbp)
	testb	$64, -119(%rbp)
	je	LBB6_23
## %bb.13:
	movq	-104(%rbp), %r9
	movq	%r9, -176(%rbp)
	cmpl	$2, %ecx
	jg	LBB6_15
## %bb.14:
	movslq	-112(%rbp), %rax
	movslq	-108(%rbp), %rsi
	imulq	%rax, %rsi
	jmp	LBB6_22
LBB6_15:
	movq	-56(%rbp), %rdi
	movl	%ecx, %r10d
	leaq	-1(%r10), %rsi
	andl	$7, %ecx
	cmpq	$7, %rsi
	jae	LBB6_17
## %bb.16:
	movl	$1, %esi
	xorl	%ebx, %ebx
	testq	%rcx, %rcx
	jne	LBB6_20
	jmp	LBB6_22
LBB6_17:
	subq	%rcx, %r10
	movl	$1, %esi
	xorl	%ebx, %ebx
	.p2align	4, 0x90
LBB6_18:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rdi,%rbx,4), %rdx
	imulq	%rsi, %rdx
	movslq	4(%rdi,%rbx,4), %rsi
	movslq	8(%rdi,%rbx,4), %rax
	imulq	%rsi, %rax
	imulq	%rdx, %rax
	movslq	12(%rdi,%rbx,4), %rdx
	movslq	16(%rdi,%rbx,4), %rsi
	imulq	%rdx, %rsi
	movslq	20(%rdi,%rbx,4), %rdx
	imulq	%rsi, %rdx
	imulq	%rax, %rdx
	movslq	24(%rdi,%rbx,4), %rax
	movslq	28(%rdi,%rbx,4), %rsi
	imulq	%rax, %rsi
	imulq	%rdx, %rsi
	addq	$8, %rbx
	cmpq	%rbx, %r10
	jne	LBB6_18
## %bb.19:
	testq	%rcx, %rcx
	je	LBB6_22
LBB6_20:
	leaq	(%rdi,%rbx,4), %rax
	negq	%rcx
	.p2align	4, 0x90
LBB6_21:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rax), %rdx
	imulq	%rdx, %rsi
	addq	$4, %rax
	incq	%rcx
	jne	LBB6_21
LBB6_22:
	imulq	%r8, %rsi
	addq	%rsi, %r9
	movq	%r9, -168(%rbp)
LBB6_23:
Ltmp18:
	leaq	-200(%rbp), %rdi
	xorl	%esi, %esi
	xorl	%edx, %edx
	callq	__ZN2cv16MatConstIterator4seekEPKib
Ltmp19:
## %bb.24:
	movq	%r15, -160(%rbp)
	movslq	-116(%rbp), %rcx
	testq	%rcx, %rcx
	jle	LBB6_25
## %bb.26:
	movq	-48(%rbp), %rax
	movq	-8(%rax,%rcx,8), %r8
	jmp	LBB6_27
LBB6_25:
	xorl	%r8d, %r8d
LBB6_27:
	movq	%r8, -152(%rbp)
	movq	$0, -128(%rbp)
	movq	$0, -136(%rbp)
	movq	$0, -144(%rbp)
	testb	$64, -119(%rbp)
	je	LBB6_38
## %bb.28:
	movq	-104(%rbp), %r9
	movq	%r9, -136(%rbp)
	cmpl	$2, %ecx
	jg	LBB6_30
## %bb.29:
	movslq	-112(%rbp), %rax
	movslq	-108(%rbp), %rsi
	imulq	%rax, %rsi
	jmp	LBB6_37
LBB6_30:
	movq	-56(%rbp), %rdi
	movl	%ecx, %r10d
	leaq	-1(%r10), %rsi
	andl	$7, %ecx
	cmpq	$7, %rsi
	jae	LBB6_32
## %bb.31:
	movl	$1, %esi
	xorl	%ebx, %ebx
	testq	%rcx, %rcx
	jne	LBB6_35
	jmp	LBB6_37
LBB6_32:
	subq	%rcx, %r10
	movl	$1, %esi
	xorl	%ebx, %ebx
	.p2align	4, 0x90
LBB6_33:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rdi,%rbx,4), %rdx
	imulq	%rsi, %rdx
	movslq	4(%rdi,%rbx,4), %rsi
	movslq	8(%rdi,%rbx,4), %rax
	imulq	%rsi, %rax
	imulq	%rdx, %rax
	movslq	12(%rdi,%rbx,4), %rdx
	movslq	16(%rdi,%rbx,4), %rsi
	imulq	%rdx, %rsi
	movslq	20(%rdi,%rbx,4), %rdx
	imulq	%rsi, %rdx
	imulq	%rax, %rdx
	movslq	24(%rdi,%rbx,4), %rax
	movslq	28(%rdi,%rbx,4), %rsi
	imulq	%rax, %rsi
	imulq	%rdx, %rsi
	addq	$8, %rbx
	cmpq	%rbx, %r10
	jne	LBB6_33
## %bb.34:
	testq	%rcx, %rcx
	je	LBB6_37
LBB6_35:
	leaq	(%rdi,%rbx,4), %rax
	negq	%rcx
	.p2align	4, 0x90
LBB6_36:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rax), %rdx
	imulq	%rdx, %rsi
	addq	$4, %rax
	incq	%rcx
	jne	LBB6_36
LBB6_37:
	imulq	%r8, %rsi
	addq	%rsi, %r9
	movq	%r9, -128(%rbp)
LBB6_38:
Ltmp21:
	leaq	-160(%rbp), %rdi
	xorl	%esi, %esi
	xorl	%edx, %edx
	callq	__ZN2cv16MatConstIterator4seekEPKib
Ltmp22:
## %bb.39:
	movl	-116(%rbp), %r8d
	cmpl	$2, %r8d
	jg	LBB6_41
## %bb.40:
	movslq	-112(%rbp), %rax
	movslq	-108(%rbp), %rsi
	imulq	%rax, %rsi
	testq	%rsi, %rsi
	jne	LBB6_49
	jmp	LBB6_53
LBB6_41:
	movq	-56(%rbp), %rcx
	leaq	-1(%r8), %rsi
	movl	%r8d, %r9d
	andl	$7, %r9d
	cmpq	$7, %rsi
	jae	LBB6_43
## %bb.42:
	movl	$1, %esi
	xorl	%edi, %edi
	testq	%r9, %r9
	jne	LBB6_46
	jmp	LBB6_48
LBB6_43:
	subq	%r9, %r8
	movl	$1, %esi
	xorl	%edi, %edi
	.p2align	4, 0x90
LBB6_44:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rcx,%rdi,4), %rbx
	imulq	%rsi, %rbx
	movslq	4(%rcx,%rdi,4), %rsi
	movslq	8(%rcx,%rdi,4), %rdx
	imulq	%rsi, %rdx
	imulq	%rbx, %rdx
	movslq	12(%rcx,%rdi,4), %rsi
	movslq	16(%rcx,%rdi,4), %rbx
	imulq	%rsi, %rbx
	movslq	20(%rcx,%rdi,4), %rax
	imulq	%rbx, %rax
	imulq	%rdx, %rax
	movslq	24(%rcx,%rdi,4), %rdx
	movslq	28(%rcx,%rdi,4), %rsi
	imulq	%rdx, %rsi
	imulq	%rax, %rsi
	addq	$8, %rdi
	cmpq	%rdi, %r8
	jne	LBB6_44
## %bb.45:
	testq	%r9, %r9
	je	LBB6_48
LBB6_46:
	leaq	(%rcx,%rdi,4), %rcx
	negq	%r9
	.p2align	4, 0x90
LBB6_47:                                ## =>This Inner Loop Header: Depth=1
	movslq	(%rcx), %rax
	imulq	%rax, %rsi
	addq	$4, %rcx
	incq	%r9
	jne	LBB6_47
LBB6_48:
	testq	%rsi, %rsi
	je	LBB6_53
LBB6_49:
	movq	-160(%rbp), %rax
	testq	%rax, %rax
	je	LBB6_53
## %bb.50:
	movq	-152(%rbp), %rcx
	imulq	%rsi, %rcx
	movq	-144(%rbp), %rax
	movq	-128(%rbp), %rdx
	addq	%rax, %rcx
	movq	%rcx, -144(%rbp)
	cmpq	-136(%rbp), %rcx
	jb	LBB6_52
## %bb.51:
	cmpq	%rcx, %rdx
	ja	LBB6_53
LBB6_52:
	movq	%rax, -144(%rbp)
Ltmp23:
	leaq	-160(%rbp), %rdi
	movl	$1, %edx
	callq	__ZN2cv16MatConstIterator4seekElb
Ltmp24:
LBB6_53:
	leaq	-200(%rbp), %rbx
	.p2align	4, 0x90
LBB6_54:                                ## =>This Inner Loop Header: Depth=1
	movq	-200(%rbp), %rax
	cmpq	-160(%rbp), %rax
	je	LBB6_59
## %bb.55:                              ##   in Loop: Header=BB6_54 Depth=1
	movq	-184(%rbp), %rax
	jmp	LBB6_56
	.p2align	4, 0x90
LBB6_59:                                ##   in Loop: Header=BB6_54 Depth=1
	movq	-184(%rbp), %rax
	cmpq	-144(%rbp), %rax
	je	LBB6_60
LBB6_56:                                ##   in Loop: Header=BB6_54 Depth=1
	notb	(%rax)
	notb	1(%rax)
	notb	2(%rax)
	cmpq	$0, -200(%rbp)
	je	LBB6_54
## %bb.57:                              ##   in Loop: Header=BB6_54 Depth=1
	movq	-184(%rbp), %rax
	movq	-192(%rbp), %rcx
	addq	%rax, %rcx
	movq	%rcx, -184(%rbp)
	cmpq	-168(%rbp), %rcx
	jb	LBB6_54
## %bb.58:                              ##   in Loop: Header=BB6_54 Depth=1
	movq	%rax, -184(%rbp)
Ltmp26:
	movl	$1, %esi
	movl	$1, %edx
	movq	%rbx, %rdi
	callq	__ZN2cv16MatConstIterator4seekElb
Ltmp27:
	jmp	LBB6_54
LBB6_60:
	movq	-64(%rbp), %rax
	testq	%rax, %rax
	je	LBB6_63
## %bb.61:
	lock		decl	20(%rax)
	jne	LBB6_63
## %bb.62:
Ltmp34:
	leaq	-120(%rbp), %rdi
	callq	__ZN2cv3Mat10deallocateEv
Ltmp35:
LBB6_63:
	movq	$0, -64(%rbp)
	movq	$0, -80(%rbp)
	movq	$0, -88(%rbp)
	movq	$0, -96(%rbp)
	movq	$0, -104(%rbp)
	cmpl	$0, -116(%rbp)
	jle	LBB6_66
## %bb.64:
	movq	-56(%rbp), %rax
	xorl	%ecx, %ecx
	.p2align	4, 0x90
LBB6_65:                                ## =>This Inner Loop Header: Depth=1
	movl	$0, (%rax,%rcx,4)
	incq	%rcx
	movslq	-116(%rbp), %rdx
	cmpq	%rdx, %rcx
	jl	LBB6_65
LBB6_66:
	movq	-48(%rbp), %rdi
	cmpq	%r14, %rdi
	je	LBB6_68
## %bb.67:
Ltmp36:
	callq	__ZN2cv8fastFreeEPv
Ltmp37:
LBB6_68:
	addq	$184, %rsp
	popq	%rbx
	popq	%r14
	popq	%r15
	popq	%rbp
	retq
LBB6_82:
Ltmp38:
	movq	%rax, %rdi
	callq	___clang_call_terminate
LBB6_71:
Ltmp20:
	jmp	LBB6_72
LBB6_3:
Ltmp12:
	movq	%rax, %rbx
	movq	-64(%rbp), %rax
	testq	%rax, %rax
	je	LBB6_6
## %bb.4:
	lock		decl	20(%rax)
	jne	LBB6_6
## %bb.5:
Ltmp13:
	leaq	-120(%rbp), %rdi
	callq	__ZN2cv3Mat10deallocateEv
Ltmp14:
LBB6_6:
	movq	$0, -64(%rbp)
	movq	$0, -80(%rbp)
	movq	$0, -88(%rbp)
	movq	$0, -96(%rbp)
	movq	$0, -104(%rbp)
	cmpl	$0, -116(%rbp)
	jle	LBB6_9
## %bb.7:
	movq	-56(%rbp), %rax
	xorl	%ecx, %ecx
	.p2align	4, 0x90
LBB6_8:                                 ## =>This Inner Loop Header: Depth=1
	movl	$0, (%rax,%rcx,4)
	incq	%rcx
	movslq	-116(%rbp), %rdx
	cmpq	%rdx, %rcx
	jl	LBB6_8
LBB6_9:
	movq	-48(%rbp), %rdi
	cmpq	%r14, %rdi
	je	LBB6_80
## %bb.10:
Ltmp15:
	callq	__ZN2cv8fastFreeEPv
Ltmp16:
	jmp	LBB6_80
LBB6_81:
Ltmp17:
	movq	%rax, %rdi
	callq	___clang_call_terminate
LBB6_70:
Ltmp25:
	jmp	LBB6_72
LBB6_69:
Ltmp28:
LBB6_72:
	movq	%rax, %rbx
	movq	-64(%rbp), %rax
	testq	%rax, %rax
	je	LBB6_75
## %bb.73:
	lock		decl	20(%rax)
	jne	LBB6_75
## %bb.74:
Ltmp29:
	leaq	-120(%rbp), %rdi
	callq	__ZN2cv3Mat10deallocateEv
Ltmp30:
LBB6_75:
	movq	$0, -64(%rbp)
	movq	$0, -80(%rbp)
	movq	$0, -88(%rbp)
	movq	$0, -96(%rbp)
	movq	$0, -104(%rbp)
	cmpl	$0, -116(%rbp)
	jle	LBB6_78
## %bb.76:
	movq	-56(%rbp), %rax
	xorl	%ecx, %ecx
	.p2align	4, 0x90
LBB6_77:                                ## =>This Inner Loop Header: Depth=1
	movl	$0, (%rax,%rcx,4)
	incq	%rcx
	movslq	-116(%rbp), %rdx
	cmpq	%rdx, %rcx
	jl	LBB6_77
LBB6_78:
	movq	-48(%rbp), %rdi
	cmpq	%r14, %rdi
	je	LBB6_80
## %bb.79:
Ltmp31:
	callq	__ZN2cv8fastFreeEPv
Ltmp32:
LBB6_80:
	movq	%rbx, %rdi
	callq	__Unwind_Resume
	ud2
LBB6_83:
Ltmp33:
	movq	%rax, %rdi
	callq	___clang_call_terminate
Lfunc_end2:
	.cfi_endproc
	.section	__TEXT,__gcc_except_tab
	.p2align	2
GCC_except_table6:
Lexception2:
	.byte	255                     ## @LPStart Encoding = omit
	.byte	155                     ## @TType Encoding = indirect pcrel sdata4
	.asciz	"\360"                  ## @TType base offset
	.byte	3                       ## Call site Encoding = udata4
	.byte	104                     ## Call site table length
Lset14 = Ltmp10-Lfunc_begin2            ## >> Call Site 1 <<
	.long	Lset14
Lset15 = Ltmp11-Ltmp10                  ##   Call between Ltmp10 and Ltmp11
	.long	Lset15
Lset16 = Ltmp12-Lfunc_begin2            ##     jumps to Ltmp12
	.long	Lset16
	.byte	0                       ##   On action: cleanup
Lset17 = Ltmp18-Lfunc_begin2            ## >> Call Site 2 <<
	.long	Lset17
Lset18 = Ltmp19-Ltmp18                  ##   Call between Ltmp18 and Ltmp19
	.long	Lset18
Lset19 = Ltmp20-Lfunc_begin2            ##     jumps to Ltmp20
	.long	Lset19
	.byte	0                       ##   On action: cleanup
Lset20 = Ltmp21-Lfunc_begin2            ## >> Call Site 3 <<
	.long	Lset20
Lset21 = Ltmp24-Ltmp21                  ##   Call between Ltmp21 and Ltmp24
	.long	Lset21
Lset22 = Ltmp25-Lfunc_begin2            ##     jumps to Ltmp25
	.long	Lset22
	.byte	0                       ##   On action: cleanup
Lset23 = Ltmp26-Lfunc_begin2            ## >> Call Site 4 <<
	.long	Lset23
Lset24 = Ltmp27-Ltmp26                  ##   Call between Ltmp26 and Ltmp27
	.long	Lset24
Lset25 = Ltmp28-Lfunc_begin2            ##     jumps to Ltmp28
	.long	Lset25
	.byte	0                       ##   On action: cleanup
Lset26 = Ltmp34-Lfunc_begin2            ## >> Call Site 5 <<
	.long	Lset26
Lset27 = Ltmp37-Ltmp34                  ##   Call between Ltmp34 and Ltmp37
	.long	Lset27
Lset28 = Ltmp38-Lfunc_begin2            ##     jumps to Ltmp38
	.long	Lset28
	.byte	1                       ##   On action: 1
Lset29 = Ltmp13-Lfunc_begin2            ## >> Call Site 6 <<
	.long	Lset29
Lset30 = Ltmp16-Ltmp13                  ##   Call between Ltmp13 and Ltmp16
	.long	Lset30
Lset31 = Ltmp17-Lfunc_begin2            ##     jumps to Ltmp17
	.long	Lset31
	.byte	1                       ##   On action: 1
Lset32 = Ltmp29-Lfunc_begin2            ## >> Call Site 7 <<
	.long	Lset32
Lset33 = Ltmp32-Ltmp29                  ##   Call between Ltmp29 and Ltmp32
	.long	Lset33
Lset34 = Ltmp33-Lfunc_begin2            ##     jumps to Ltmp33
	.long	Lset34
	.byte	1                       ##   On action: 1
Lset35 = Ltmp32-Lfunc_begin2            ## >> Call Site 8 <<
	.long	Lset35
Lset36 = Lfunc_end2-Ltmp32              ##   Call between Ltmp32 and Lfunc_end2
	.long	Lset36
	.long	0                       ##     has no landing pad
	.byte	0                       ##   On action: cleanup
	.byte	1                       ## >> Action Record 1 <<
                                        ##   Catch TypeInfo 1
	.byte	0                       ##   No further actions
                                        ## >> Catch TypeInfos <<
	.long	0                       ## TypeInfo 1
	.p2align	2
                                        ## -- End function

-O0

	.globl	__Z19NegativRawPtrAccessRN2cv3MatE ## -- Begin function _Z19NegativRawPtrAccessRN2cv3MatE
	.p2align	4, 0x90
__Z19NegativRawPtrAccessRN2cv3MatE:     ## @_Z19NegativRawPtrAccessRN2cv3MatE
	.cfi_startproc
## %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	subq	$32, %rsp
	movq	%rdi, -8(%rbp)
	movl	$0, -12(%rbp)
LBB24_1:                                ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB24_3 Depth 2
	movl	-12(%rbp), %eax
	movq	-8(%rbp), %rcx
	cmpl	8(%rcx), %eax
	jge	LBB24_8
## %bb.2:                               ##   in Loop: Header=BB24_1 Depth=1
	xorl	%edx, %edx
	movq	-8(%rbp), %rdi
	movl	-12(%rbp), %esi
	callq	__ZN2cv3Mat3ptrINS_3VecIhLi3EEEEEPT_ii
	movq	%rax, -24(%rbp)
	movq	-24(%rbp), %rax
	movq	-8(%rbp), %rdi
	movslq	12(%rdi), %rdi
	imulq	$3, %rdi, %rdi
	addq	%rdi, %rax
	movq	%rax, -32(%rbp)
LBB24_3:                                ##   Parent Loop BB24_1 Depth=1
                                        ## =>  This Inner Loop Header: Depth=2
	movq	-24(%rbp), %rax
	cmpq	-32(%rbp), %rax
	je	LBB24_6
## %bb.4:                               ##   in Loop: Header=BB24_3 Depth=2
	movl	$255, %eax
	movq	-24(%rbp), %rcx
	movzbl	(%rcx), %edx
	movl	%eax, %esi
	subl	%edx, %esi
	movb	%sil, %dil
	movq	-24(%rbp), %rcx
	movb	%dil, (%rcx)
	movq	-24(%rbp), %rcx
	movzbl	1(%rcx), %edx
	movl	%eax, %esi
	subl	%edx, %esi
	movb	%sil, %dil
	movq	-24(%rbp), %rcx
	movb	%dil, 1(%rcx)
	movq	-24(%rbp), %rcx
	movzbl	2(%rcx), %edx
	subl	%edx, %eax
	movb	%al, %dil
	movq	-24(%rbp), %rcx
	movb	%dil, 2(%rcx)
## %bb.5:                               ##   in Loop: Header=BB24_3 Depth=2
	movq	-24(%rbp), %rax
	addq	$3, %rax
	movq	%rax, -24(%rbp)
	jmp	LBB24_3
LBB24_6:                                ##   in Loop: Header=BB24_1 Depth=1
	jmp	LBB24_7
LBB24_7:                                ##   in Loop: Header=BB24_1 Depth=1
	movl	-12(%rbp), %eax
	addl	$1, %eax
	movl	%eax, -12(%rbp)
	jmp	LBB24_1
LBB24_8:
	addq	$32, %rsp
	popq	%rbp
	retq
	.cfi_endproc
                                        ## -- End function
	.globl	__Z19NegativeMatIteratorRN2cv3MatE ## -- Begin function _Z19NegativeMatIteratorRN2cv3MatE
	.p2align	4, 0x90
__Z19NegativeMatIteratorRN2cv3MatE:     ## @_Z19NegativeMatIteratorRN2cv3MatE
Lfunc_begin1:
	.cfi_startproc
	.cfi_personality 155, ___gxx_personality_v0
	.cfi_lsda 16, Lexception1
## %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	subq	$256, %rsp              ## imm = 0x100
	movq	%rdi, -8(%rbp)
	movq	-8(%rbp), %rsi
	leaq	-112(%rbp), %rdi
	movq	%rdi, -224(%rbp)        ## 8-byte Spill
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEEC1ERKNS_3MatE
	movq	-224(%rbp), %rsi        ## 8-byte Reload
	movq	%rsi, -16(%rbp)
	movq	-16(%rbp), %rsi
Ltmp5:
	leaq	-152(%rbp), %rdi
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEE5beginEv
Ltmp6:
	jmp	LBB26_1
LBB26_1:
	movq	-16(%rbp), %rsi
Ltmp7:
	leaq	-208(%rbp), %rdi
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEE3endEv
Ltmp8:
	jmp	LBB26_2
LBB26_2:
	jmp	LBB26_3
LBB26_3:                                ## =>This Inner Loop Header: Depth=1
Ltmp9:
	leaq	-152(%rbp), %rdi
	leaq	-208(%rbp), %rsi
	callq	__ZN2cvneINS_3VecIhLi3EEEEEbRKNS_12MatIterator_IT_EES7_
Ltmp10:
	movb	%al, -225(%rbp)         ## 1-byte Spill
	jmp	LBB26_4
LBB26_4:                                ##   in Loop: Header=BB26_3 Depth=1
	movb	-225(%rbp), %al         ## 1-byte Reload
	testb	$1, %al
	jne	LBB26_7
	jmp	LBB26_5
LBB26_5:
	leaq	-112(%rbp), %rdi
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEED1Ev
	jmp	LBB26_11
LBB26_6:
Ltmp15:
	leaq	-112(%rbp), %rdi
	movl	%edx, %ecx
	movq	%rax, -160(%rbp)
	movl	%ecx, -164(%rbp)
	callq	__ZN2cv4Mat_INS_3VecIhLi3EEEED1Ev
	jmp	LBB26_12
LBB26_7:                                ##   in Loop: Header=BB26_3 Depth=1
Ltmp11:
	leaq	-152(%rbp), %rdi
	callq	__ZNK2cv12MatIterator_INS_3VecIhLi3EEEEdeEv
Ltmp12:
	movq	%rax, -240(%rbp)        ## 8-byte Spill
	jmp	LBB26_8
LBB26_8:                                ##   in Loop: Header=BB26_3 Depth=1
	movl	$255, %eax
	movq	-240(%rbp), %rcx        ## 8-byte Reload
	movq	%rcx, -216(%rbp)
	movq	-216(%rbp), %rdx
	movzbl	(%rdx), %esi
	movl	%eax, %edi
	subl	%esi, %edi
	movb	%dil, %r8b
	movq	-216(%rbp), %rdx
	movb	%r8b, (%rdx)
	movq	-216(%rbp), %rdx
	movzbl	1(%rdx), %esi
	movl	%eax, %edi
	subl	%esi, %edi
	movb	%dil, %r8b
	movq	-216(%rbp), %rdx
	movb	%r8b, 1(%rdx)
	movq	-216(%rbp), %rdx
	movzbl	2(%rdx), %esi
	subl	%esi, %eax
	movb	%al, %r8b
	movq	-216(%rbp), %rdx
	movb	%r8b, 2(%rdx)
## %bb.9:                               ##   in Loop: Header=BB26_3 Depth=1
Ltmp13:
	leaq	-152(%rbp), %rdi
	callq	__ZN2cv12MatIterator_INS_3VecIhLi3EEEEppEv
Ltmp14:
	movq	%rax, -248(%rbp)        ## 8-byte Spill
	jmp	LBB26_10
LBB26_10:                               ##   in Loop: Header=BB26_3 Depth=1
	jmp	LBB26_3
LBB26_11:
	addq	$256, %rsp              ## imm = 0x100
	popq	%rbp
	retq
LBB26_12:
	movq	-160(%rbp), %rdi
	callq	__Unwind_Resume
	ud2
Lfunc_end1:
	.cfi_endproc
	.section	__TEXT,__gcc_except_tab
	.p2align	2
GCC_except_table26:
Lexception1:
	.byte	255                     ## @LPStart Encoding = omit
	.byte	155                     ## @TType Encoding = indirect pcrel sdata4
	.byte	41                      ## @TType base offset
	.byte	3                       ## Call site Encoding = udata4
	.byte	39                      ## Call site table length
Lset7 = Lfunc_begin1-Lfunc_begin1       ## >> Call Site 1 <<
	.long	Lset7
Lset8 = Ltmp5-Lfunc_begin1              ##   Call between Lfunc_begin1 and Ltmp5
	.long	Lset8
	.long	0                       ##     has no landing pad
	.byte	0                       ##   On action: cleanup
Lset9 = Ltmp5-Lfunc_begin1              ## >> Call Site 2 <<
	.long	Lset9
Lset10 = Ltmp14-Ltmp5                   ##   Call between Ltmp5 and Ltmp14
	.long	Lset10
Lset11 = Ltmp15-Lfunc_begin1            ##     jumps to Ltmp15
	.long	Lset11
	.byte	0                       ##   On action: cleanup
Lset12 = Ltmp14-Lfunc_begin1            ## >> Call Site 3 <<
	.long	Lset12
Lset13 = Lfunc_end1-Ltmp14              ##   Call between Ltmp14 and Lfunc_end1
	.long	Lset13
	.long	0                       ##     has no landing pad
	.byte	0                       ##   On action: cleanup
	.p2align	2
                                        ## -- End function

NegativRawPtrAccess dla -O2 nie wywołuje żadnych funkcji, a w wersji -O0 tylko __ZN2cv3Mat3ptrINS_3VecIhLi3EEEEEPT_ii (po demangle: cv::Vec<unsigned char, 3>* cv::Mat::ptr<cv::Vec<unsigned char, 3> >(int, int)).

Natomiast NegativeMatIterator wywołuje od groma funkcji, w tym cv::Mat::deallocate(), więc wygląda na to, że robi jakąś kopię.

Dlatego radzę dopisać do tego kodu testy, które sprawdzą, czy faktycznie obie wersje robią to samo.
Szczególnie że dokumentacja konstruktora cv::Mat_<_Tp>::Mat_(const Mat& m) mówi:

copy/conversion constructor. If m is of different type, it's converted


Sprawdziłem: nie robi kopii, a ten `deallocate` jest tylko na wypadek, gdyby kopia jednak była potrzebna.

1 użytkowników online, w tym zalogowanych: 0, gości: 1