|
This code:
template<_sizetype M> class multi_ptr {
public:
typedef multi_ptr<M - 1>::pointer *pointer;
};
class multi_ptr<0>{
public:
typedef T pointer;
};
gives this error:
error: type ‘multi_array<N, T>::multi_ptr<(M - 1)>’ is not derived from type ‘multi_array<N, T>::multi_ptr<M>’ expected ‘;’ before ‘*’ token
I found the problem, I fix by adding 'template':
template<_sizetype M> class multi_ptr {
public:
typedef typename multi_ptr<M - 1>::pointer *pointer;
};
class multi_ptr<0>{
public:
typedef T pointer;
};
Now, there is a new error:
error: too few template-parameter-lists
pointing at
class multi_ptr<0>{
I added template<> before this and still not working.
|
|
|
|
|
I'm not too familiar with templates, but I am with 'new' and 'delete'. How does your class deallocate the memory when your done with it?
|
|
|
|
|
Hi,
I am sorry to reply that late.
Actually, in this implementation, a deallocator is not declared. Anyway, I am working on updating this article. The updates include avoiding nested class declaration, gcc compatability, solving some problem regarding Visual Studio 8.
Thanks alot for time, Bye.
|
|
|
|
|
Hi
I was wondering if it was possible to store the required number of dimensions in a variable and pass this to the template, as in the following code segment.
int main(void)
{
int var = 4;
multi_array<int, var=""> ia4(4, 3, 56, 2);
ia4[0][0][0][0]=99;
cout<
|
|
|
|
|
oops this should have had
multi_array <int, var> ia4(4, 3, 56, 2);
|
|
|
|
|
hi,
if u mean this
<code>
multi_array<int, 4> ia4(4, 3, 56, 2);
</code>
the answer is yessssss!!
thx!!
Yours, M. Hossny
|
|
|
|
|
Hi
Kinda,
But instead explicitly specifying 4 as in your declaration, I just want to pass a variable for the number of dimensions.
<code>
int var = 4;
multi_array<int, var> ia4(4, 3, 56, 2);
</code>
If I am doing something silly, I apologise in advance
Thanks
SMJT
|
|
|
|
|
hi,
this is a compile time customization for a multi-dimensional array!!
IT IS ACOMPILE TIME ONLY!! AND THERE IS NO WAY TO HANDLE IT WITH RUNTIME ARGUMENTS!!
another alternative is to develop ur casting operators that converts between arrays of different dimensions!!!
thx alot, bye
Yours, M. Hossny
|
|
|
|
|
I got 2 compile errors on VC++ .NET 2003 in multi_array.h on line 36:
error C2143: sintax error : missing ';'before '*'
error C2501: 'multi_array<t,n>::multi_ptr <m>::pointer' : missing storage-class or type specifiers
The compiler seems not to be understanding the recursive type definition. Any idea/sugestion on what might be wrong?
Nice article, BTW.
/*----------------------------------*
Rui A. Rebelo
rui_a_rebelo@despammed.com
*----------------------------------*/
|
|
|
|
|
yeah i faced the same problem but it works good on VC++ .NET 2002
bye!!
Yours, M. Hossny
|
|
|
|
|
hi,
i found a solution for this problem in VC++ .NET 2003
prefix the dependent type bye typename keyword. it tells the compiler that this type will be defined later.
multi_ptr is now different
<br />
template<_sizetype M><br />
class multi_ptr<br />
{<br />
public:<br />
typedef double T;<br />
typedef typename multi_ptr<M-1>::pointer* pointer; <br />
};<br />
<br />
class multi_ptr<0><br />
{<br />
public:<br />
typedef double T;<br />
typedef T pointer;<br />
};
Yours, M. Hossny
|
|
|
|
|
Why can it be faster ?
Doesn't compiler perform (4*i+j) behind the scenes ?
|
|
|
|
|
nope the program, just use indirect addressing modes
it s a pointer to pointer, and not just a single array with indexing tricks .
Yours, M. Hossny
|
|
|
|
|
Nope, the code generates the same machine code.
int main (int argc, char * argv[])
{
int pi1[20];
int pi2[4][5];
memset (pi1, 0, sizeof (pi1));
memset (pi2, 0, sizeof (pi2));
int a = 0;
int b = 0;
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
a += pi2[i][j];
}
}
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
b += pi1[i*4+j];
}
}
printf ("%d %d", a, b);
return 0;
}
Has this output
PUBLIC _main
PUBLIC ??_C@_05OKMLJOMC@?$CFd?5?$CFd?$AA@ ; `string'
EXTRN _printf:NEAR
; COMDAT ??_C@_05OKMLJOMC@?$CFd?5?$CFd?$AA@
; File c:\dev\exo\exo.cpp
CONST SEGMENT
??_C@_05OKMLJOMC@?$CFd?5?$CFd?$AA@ DB '%d %d', 00H ; `string'
; Function compile flags: /Ogty
CONST ENDS
; COMDAT _main
_TEXT SEGMENT
_pi2$ = -160
_pi1$ = -80
_argc$ = 8
_argv$ = 12
_main PROC NEAR ; COMDAT
; 42 : {
00000 55 push ebp
00001 8b ec mov ebp, esp
00003 83 e4 f8 and esp, -8 ; fffffff8H
00006 81 ec a4 00 00
00 sub esp, 164 ; 000000a4H
0000c 53 push ebx
0000d 56 push esi
0000e 57 push edi
; 43 : int pi1[20];
; 44 : int pi2[4][5];
; 45 : memset (pi1, 0, sizeof (pi1));
0000f 33 c0 xor eax, eax
00011 b9 14 00 00 00 mov ecx, 20 ; 00000014H
00016 8d 7c 24 60 lea edi, DWORD PTR _pi1$[esp+176]
0001a f3 ab rep stosd
; 46 : memset (pi2, 0, sizeof (pi2));
0001c b9 14 00 00 00 mov ecx, 20 ; 00000014H
00021 8d 7c 24 10 lea edi, DWORD PTR _pi2$[esp+176]
00025 f3 ab rep stosd
; 47 : int a = 0;
00027 33 f6 xor esi, esi
; 48 : int b = 0;
00029 33 d2 xor edx, edx
0002b 8d 44 24 18 lea eax, DWORD PTR _pi2$[esp+184]
0002f b9 04 00 00 00 mov ecx, 4
$L690:
; 49 : for (int i = 0; i < 4; i++)
; 50 : {
; 51 : for (int j = 0; j < 4; j++)
; 52 : {
; 53 : a += pi2[i][j];
00034 8b 58 fc mov ebx, DWORD PTR [eax-4]
00037 8b 78 f8 mov edi, DWORD PTR [eax-8]
0003a 03 fb add edi, ebx
0003c 03 78 04 add edi, DWORD PTR [eax+4]
0003f 03 38 add edi, DWORD PTR [eax]
00041 03 f7 add esi, edi
00043 83 c0 14 add eax, 20 ; 00000014H
00046 49 dec ecx
00047 75 eb jne SHORT $L690
; 54 : }
; 55 : }
; 56 : for (int i = 0; i < 4; i++)
00049 8d 44 24 68 lea eax, DWORD PTR _pi1$[esp+184]
0004d b9 04 00 00 00 mov ecx, 4
$L698:
; 57 : {
; 58 : for (int j = 0; j < 4; j++)
; 59 : {
; 60 : b += pi1[i*4+j];
00052 8b 58 fc mov ebx, DWORD PTR [eax-4]
00055 8b 78 f8 mov edi, DWORD PTR [eax-8]
00058 03 fb add edi, ebx
0005a 03 78 04 add edi, DWORD PTR [eax+4]
0005d 03 38 add edi, DWORD PTR [eax]
0005f 03 d7 add edx, edi
00061 83 c0 10 add eax, 16 ; 00000010H
00064 49 dec ecx
00065 75 eb jne SHORT $L698
; 61 : }
; 62 : }
; 63 : printf ("%d %d", a, b);
00067 52 push edx
00068 56 push esi
00069 68 00 00 00 00 push OFFSET FLAT:??_C@_05OKMLJOMC@?$CFd?5?$CFd?$AA@
0006e e8 00 00 00 00 call _printf
00073 83 c4 0c add esp, 12 ; 0000000cH
; 64 : return 0;
; 65 : }
00076 5f pop edi
00077 5e pop esi
00078 33 c0 xor eax, eax
0007a 5b pop ebx
0007b 8b e5 mov esp, ebp
0007d 5d pop ebp
0007e c3 ret 0
_main ENDP
_TEXT ENDS
END
*NOTE* The inner loop has been unrolled.
The heart of the loop uses almost the EXACT same machine code.
Tim Smith
I'm going to patent thought. I have yet to see any prior art.
|
|
|
|
|
gotcha
this sounds right only with static arrays. static arrays are allocated in compile time as a long 1-d array of bytes.
try this for a dynamic allocated array it wont work .
<br />
int* pi1=new int[20];<br />
int* pi2=new int*[4];<br />
for(int i=0; i<4; i++)<br />
pi2[i]=new int[5];<br />
thx for time, salam.
Yours, M. Hossny
|
|
|
|
|
Lets look at it again with routines called GetAt (array, i, j)
For pi1 [20], here is what we have:
int GetAt (int pi1 [20], int i, int j)
{
return pi1 [i * 5 + j];
}
00000 8b 44 24 08 mov eax, DWORD PTR _i$[esp-4]
00004 8b 4c 24 0c mov ecx, DWORD PTR _j$[esp-4]
00008 8d 14 81 lea edx, DWORD PTR [ecx+eax*4]
0000b 8b 4c 24 04 mov ecx, DWORD PTR _pi1$[esp-4]
0000f 03 c2 add eax, edx
00011 8b 04 81 mov eax, DWORD PTR [ecx+eax*4]
(without the arglist access instructions)
00008 8d 14 81 lea edx, DWORD PTR [ecx+eax*4]
0000f 03 c2 add eax, edx
00011 8b 04 81 mov eax, DWORD PTR [ecx+eax*4]
Here is pi2 [4][5]
int GetAt (int pi2 [4][5], int i, int j)
{
return pi2 [i][j];
}
00000 8b 44 24 08 mov eax, DWORD PTR _i$[esp-4]
00004 8b 4c 24 0c mov ecx, DWORD PTR _j$[esp-4]
00008 8d 14 81 lea edx, DWORD PTR [ecx+eax*4]
0000b 8b 4c 24 04 mov ecx, DWORD PTR _pi2$[esp-4]
0000f 03 c2 add eax, edx
00011 8b 04 81 mov eax, DWORD PTR [ecx+eax*4]
(without the arglist access instructions)
00008 8d 14 81 lea edx, DWORD PTR [ecx+eax*4]
0000f 03 c2 add eax, edx
00011 8b 04 81 mov eax, DWORD PTR [ecx+eax*4]
Here is int **pi4;
int GetAt (int **pi4, int i, int j)
{
return pi4 [i] [j];
}
00000 8b 44 24 08 mov eax, DWORD PTR _i$[esp-4]
00004 8b 4c 24 04 mov ecx, DWORD PTR _pi4$[esp-4]
00008 8b 14 81 mov edx, DWORD PTR [ecx+eax*4]
0000b 8b 44 24 0c mov eax, DWORD PTR _j$[esp-4]
0000f 8b 04 82 mov eax, DWORD PTR [edx+eax*4]
(without the arglist access instructions)
00008 8b 14 81 mov edx, DWORD PTR [ecx+eax*4]
0000f 8b 04 82 mov eax, DWORD PTR [edx+eax*4]
Here is the multi dim
int GetAt (multi_array <int, 2> &pi3, int i, int j)
{
return pi3 [i][j];
}
00000 8b 44 24 04 mov eax, DWORD PTR _pi3$[esp-4]
00004 8b 08 mov ecx, DWORD PTR [eax]
00006 8b 54 24 08 mov edx, DWORD PTR _i$[esp-4]
0000a 8b 04 91 mov eax, DWORD PTR [ecx+edx*4]
0000d 8b 4c 24 0c mov ecx, DWORD PTR _j$[esp-4]
00011 8b 04 88 mov eax, DWORD PTR [eax+ecx*4]
(without the arglist access instructions)
(note, the first instruction is an artifact of the
calling method and really shouldn't be held against the implementation.)
00004 8b 08 mov ecx, DWORD PTR [eax]
0000a 8b 04 91 mov eax, DWORD PTR [ecx+edx*4]
00011 8b 04 88 mov eax, DWORD PTR [eax+ecx*4]
As to the first question of is p[20] slower or faster than p[4][5] where we are using normal C arrays (as stated in the article), they are the same thing EXACTLY.
Now let us take a look at your multidim. What we are doing is trading off an LEA and an addition for an extra memory access. If we have an array of something that isn't nicely sized, such as a 54 byte class, then everyone ends up doing one IMUL and multidim has an extra memory access. Now when you consider that the multidim system has to allocate and deallocate memory, it doesn't make it very good when you have transient objects in a tight loop. Also, in a tight loop accessing a fair amount of memory, the extra memory access of multidim can blow your L1 or L2 cache and thus slow down your program.
I'm not saying multidim is bad. Not in the least. However, when you start talking about performance, things get very tricky, very fast. I have no idea which of the four is the fastest without running the program through a series of different array sizes and types using a tool such as VTune (Intel). However, the memory allocations and deallocations would kill a program with transient arrays in a tight loop. I would NEVER use this solution in places where I just need a simple multidim array that can be allocated on the stack. In the case of a multithreaded application, memory allocations run the risk of stalling threads on a multiprocessor box. (malloc/free is a LOT slower than "ADD SP, 48").
Tim Smith
I'm going to patent thought. I have yet to see any prior art.
|
|
|
|
|
"Also, in a tight loop accessing a fair amount of memory, the extra memory access of multidim can blow your L1 or L2 cache and thus slow down your program."
This is very important.
As I understand it, the possible performance gain in the multidim case ultimately comes from performing a table lookup instead of a multiplication.
Table lookups are usually slow in practice because of the potential for cache misses. One miss of the L2 cache is worse than a huge number of multiplies. You need to be very careful when doing timing tests to make sure that your test is realistic in its use of the cache.
For simple C arrays, there is often no multiply because LEA can do the work, so no benefit there. On P2, P3 a multiply only costs 4 clock cycles anyway.
But even when there is a slow multiply, I doubt you'll get a real benefit, even if you ignore the memory allocation costs. A poorly designed test might fool you into thinking it's better than it would be in a real application.
----
On the other hand, I think there are future gains to be had by template metaprocessing, so it's definitely worth pursuing these kinds of ideas. Have you had a look at Blitz++ and boost::numerical?
Incidentally, if you really want to chase after performance, have a read of Agner Fog's assembly optimisation manual and look at the floating point chapter. (search for "Agner Fog" on google). He was able to beat the DAXPY code that Intel published as their demo of how fast Pentiums could run!
|
|
|
|
|
Good points in this discussion!
You seem to have stumbled on my first rule of optimization:
Rule 1: Don't bother. (...unless instructed otherwise by a profiler.)
Chances are your bottlenecks are elsewhere in your program, and programmers are fairly notorious for "optimizing" something that would in practice be insignificant compared to, say, a network x-fer or a large DB query.
~Nitron.
ññòòïðïðB A start
|
|
|
|
|
I think the biggest reason for this is with the pipelined architecture of current AMD and Intel chips an integer multiply is effectively one clock tick when there is not a dependency with the next or previous instruction(s) and there is not a branch mispredict. A table lookup that causes a L1 cache miss is going to take much longer than that.
John
|
|
|
|
|
sorry 4 being that late .
i guess
<br />
int GetAt (int pi1 [20], int i, int j)<br />
int GetAt (int pi2 [4][5], int i, int j)<br />
are the same, multi-dimensional static arrays r actually implemented as a single uni-dimensional array. Thats why u cant pass an int[4][5] into an int** parameter.
I still find ur comparison unfair!! U ve to compare m/c code for an int** vs int* .
And i agree with u!! LEA instruction captures the effective address in one shot .
However consider this for a highr dimensional array. I work in movie enhancement. This means working in at least 4-dimensional arrays (2-dims for spatial coordinates in each frame, 1-dim for color components, and additional 1-dim for timing). Now consider one more dimension to work on multiple movies!!!
Acessing a single color component in some frame in ... costs toooo much as i thought!!
I guess multi-dimensional arrays provides acess via 5 hops!!
Now consider this trial
<br />
int getat(int* p1, int i, int j, int k, int l)<br />
{<br />
return p1[l*60+k*12+i*4+j];<br />
}<br />
<br />
DEBUG CODE GENERATION<br />
mov eax, DWORD PTR _l$[ebp]<br />
imul eax, 60 ; 0000003cH<br />
mov ecx, DWORD PTR _k$[ebp]<br />
imul ecx, 12 ; 0000000cH<br />
add ecx, DWORD PTR _j$[ebp]<br />
add ecx, eax<br />
mov edx, DWORD PTR _i$[ebp]<br />
lea eax, DWORD PTR [ecx+edx*4]<br />
mov ecx, DWORD PTR _p1$[ebp]<br />
mov eax, DWORD PTR [ecx+eax*4]<br />
<br />
<br />
RELEASE OPTIMIZED CODE GENERATION<br />
mov eax, DWORD PTR _l$[esp-4]<br />
mov ecx, DWORD PTR _k$[esp-4]<br />
lea edx, DWORD PTR [ecx+eax*4]<br />
mov ecx, DWORD PTR _i$[esp-4]<br />
add eax, edx<br />
lea edx, DWORD PTR [ecx+eax*2]<br />
mov ecx, DWORD PTR _j$[esp-4]<br />
add eax, edx<br />
lea edx, DWORD PTR [ecx+eax*4]<br />
mov eax, DWORD PTR _p1$[esp-4]<br />
mov eax, DWORD PTR [eax+edx*4]<br />
and a multi-dimensional array like
<br />
int getat(int**** p2, int i, int j, int k, int l)<br />
{<br />
return p2[i][j][k][l];<br />
}<br />
<br />
DEBUG CODE GENERATION<br />
mov eax, DWORD PTR _i$[ebp]<br />
mov ecx, DWORD PTR _p2$[ebp]<br />
mov edx, DWORD PTR [ecx+eax*4]<br />
mov eax, DWORD PTR _j$[ebp]<br />
mov ecx, DWORD PTR [edx+eax*4]<br />
mov edx, DWORD PTR _k$[ebp]<br />
mov eax, DWORD PTR [ecx+edx*4]<br />
mov ecx, DWORD PTR _l$[ebp]<br />
mov eax, DWORD PTR [eax+ecx*4]<br />
<br />
RELEASE OPTIMIZED CODE GENERATION<br />
mov eax, DWORD PTR _i$[esp-4]<br />
mov ecx, DWORD PTR _p2$[esp-4]<br />
mov edx, DWORD PTR [ecx+eax*4]<br />
mov eax, DWORD PTR _j$[esp-4]<br />
mov ecx, DWORD PTR [edx+eax*4]<br />
mov edx, DWORD PTR _k$[esp-4]<br />
mov eax, DWORD PTR [ecx+edx*4]<br />
mov ecx, DWORD PTR _l$[esp-4]<br />
mov eax, DWORD PTR [eax+ecx*4]<br />
u got my point!!
i know nothing about cache optimization, but i guess multi-dimensional arrays are more intuitive, faster to compile and optimize, and faster in terms of cpu clock tics at high orders.
thx alot for time, salam .
Yours, M. Hossny
|
|
|
|
|