What is the most efficient matrix or vector in C?

4

I would like to know which of the two is more efficient in terms of execution time (it does not matter if it consumes more resources than others). This comparison is based on the following:

```Acceder de forma matriz[fila][columna];
Acceder de forma vector[totalcolumnas * fila + columna];
```

Or if there is another better method.

asked by Islam Linarez 12.01.2017 в 06:24
source

1 answer

5

Case 1: Random access

Let's check what happens with a simple case study:

``````int matrizPila[10][10];
int **matrizDinamica;
int vectorPila[100];
int* vectorDinamico;

int funcMatrizPila(int fila, int columna)
{
return matrizPila[fila][columna];
}

int funcMatrizDinamica(int fila, int columna)
{
return matrizDinamica[fila][columna];
}

int funcVectorPila(int fila, int columna)
{
return vectorPila[fila*10+columna];
}

int funcVectorDinamico(int fila, int columna, int numColumnas)
{
return vectorDinamico[fila*numColumnas+columna];
}
``````

The variables I have labeled as `extern` to avoid warnings when compiling.

Compiling the previous code with gcc 6.3 (it is a C ++ compilation although I do not think you gave much of a compilation in C) the result is the following:

``````funcMatrizPila(int, int):
movsx   rdi, edi
movsx   rsi, esi
lea     rax, [rdi+rdi*4]
lea     rax, [rsi+rax*2]
mov     eax, DWORD PTR matrizPila[0+rax*4]
ret
funcMatrizDinamica(int, int):
mov     rax, QWORD PTR matrizDinamica[rip]
movsx   rdi, edi
movsx   rsi, esi
mov     rax, QWORD PTR [rax+rdi*8]
mov     eax, DWORD PTR [rax+rsi*4]
ret
funcVectorPila(int, int):
lea     eax, [rdi+rdi*4]
lea     eax, [rsi+rax*2]
cdqe
mov     eax, DWORD PTR vectorPila[0+rax*4]
ret
funcVectorDinamico(int, int, int):
imul    edi, edx
mov     rax, QWORD PTR vectorDinamico[rip]
lea     edx, [rdi+rsi]
movsx   rdx, edx
mov     eax, DWORD PTR [rax+rdx*4]
ret
``````

If we analyze the resulting code we can draw some conclusions:

• In the case of `matrizPila` and `vectorPila` the result is practically the same. This is because in both cases the memory is contiguous and is organized in the same way, then the operation that allows to obtain the value of a cell is the same.
• In the case of `matrizDinamica` the compiler has to solve two indirections to obtain the value (one to jump to the corresponding row and another to jump to the column for the given row). This results in a poorer performance compared to the two previous cases.
• In the case of `vectorDinamico` , the compiler is forced to perform an indirection more than `vectorPila` . This translates, as in the previous case, into a lower performance. Also now you have to make a product manually ( `fila*numColumnas` - > `imul edi,edx` ), which is an important penalty.

A detail that is not seen in the assembler but that also affects is the location of the data. Computers tend to cache memory to allow quick access to data. This tends to greatly accelerate operations if the data tends to be all together. In the case of `matrizDinamica` this point is not fulfilled since each row will be in a random position of the memory and this will force the system to repaginate the cache constantly and this can easily translate into a loss of performance quite noticeable.

Case 2: access to known positions at compile time

``````int matrizPila[10][10];
int **matrizDinamica;
int vectorPila[100];
int* vectorDinamico;

int funcMatrizPila()
{
return matrizPila[3][4];
}

int funcMatrizDinamica()
{
return matrizDinamica[3][4];
}

int funcVectorPila()
{
return vectorPila[3*10+4];
}

int funcVectorDinamico(int numColumnas)
{
return vectorDinamico[3*numColumnas+4];
}
``````

For this code the resulting assembler is:

``````funcMatrizPila():
mov     eax, DWORD PTR matrizPila[rip+136]
ret
funcMatrizDinamica():
mov     rax, QWORD PTR matrizDinamica[rip]
mov     rax, QWORD PTR [rax+24]
mov     eax, DWORD PTR [rax+16]
ret
funcVectorPila():
mov     eax, DWORD PTR vectorPila[rip+136]
ret
funcVectorDinamico(int):
lea     eax, [rdi+rdi*2]
mov     rdx, QWORD PTR vectorDinamico[rip]
cdqe
mov     eax, DWORD PTR [rdx+16+rax*4]
ret
``````

In this case we observe the following:

• The access to `matrizPila` and `vectorPila` is exactly the same and the execution time tends to 0. The compiler is able to precalculate the offset of the jump, reducing the obtaining of the result to solve a simple sum.
• Access to `matrizDinamica` and `vectorDinamico` is simpler, which should imply a shorter execution time. Perhaps the greatest effect is achieved in the case of `vectorDinamico` , but the most costly thing is to solve the two indirectities have to continue making.

Seen this way the classification, in this case, will be very similar to that of case 1 but with shorter execution times.

Case study:

Let's now try all the cases we have seen to see how they behave. For this we can use a code like the following:

``````#include <time.h>
#include <stdio.h>
#include <stdlib.h>

int matrizPila[10][10];
int **matrizDinamica;
int vectorPila[100];
int* vectorDinamico;

int funcMatrizPila(int fila, int columna)
{
return matrizPila[fila][columna];
}

int funcMatrizDinamica(int fila, int columna)
{
return matrizDinamica[fila][columna];
}

int funcVectorPila(int fila, int columna)
{
return vectorPila[fila*10+columna];
}

int funcVectorDinamico(int fila, int columna, int numColumnas)
{
return vectorDinamico[fila*numColumnas+columna];
}

int funcMatrizPilaFijo()
{
return matrizPila[3][4];
}

int funcMatrizDinamicaFijo()
{
return matrizDinamica[3][4];
}

int funcVectorPilaFijo()
{
return vectorPila[3*10+4];
}

int funcVectorDinamicoFijo(int numColumnas)
{
return vectorDinamico[3*numColumnas+4];
}

int res = 0;

#define BEGIN_TEST \
clock_t begin = clock(); \
for( int i=0; i<10000000; i++) \
for( int j=0; j<100; j++)

#define END_TEST(nombre) \
clock_t end = clock(); \
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC; \
printf("%-22s: %f\n",nombre,time_spent);

void testFunc0( const char* funcName, int(*func)())
{
BEGIN_TEST

res += func();

END_TEST(funcName)
}

void testFunc1( const char* funcName, int(*func)(int), int var1)
{
BEGIN_TEST

res += func(var1);

END_TEST(funcName)
}

void testFunc2( const char* funcName, int(*func)(int, int), int var1, int var2)
{
BEGIN_TEST

res += func(var1,var2);

END_TEST(funcName)
}

void testFunc3( const char* funcName, int(*func)(int, int, int), int var1, int var2, int var3)
{
BEGIN_TEST

res += func(var1,var2,var3);

END_TEST(funcName)
}

int main()
{
matrizDinamica = (int**)malloc(10*sizeof(int*));
for(int i=0; i<10; i++)
matrizDinamica[i] = (int*)malloc(10*sizeof(int));

vectorDinamico = (int*)malloc(100*sizeof(int));

testFunc0( "funcMatrizPilaFijo",    funcMatrizPilaFijo);
testFunc0( "funcVectorPilaFijo",    funcVectorPilaFijo);
testFunc0( "funcMatrizDinamicaFijo",funcMatrizDinamicaFijo);
testFunc1( "funcVectorDinamicoFijo",funcVectorDinamicoFijo,10);
testFunc2( "funcMatrizPila",        funcMatrizPila,3,4);
testFunc2( "funcVectorPila",        funcVectorPila,3,4);
testFunc2( "funcMatrizDinamica",    funcMatrizDinamica,3,4);
testFunc3( "funcVectorDinamico",    funcVectorDinamico,3,4,10);
}
``````

In my case the results obtained are the following:

``````funcMatrizPilaFijo    : 1.810000
funcVectorPilaFijo    : 1.800000
funcMatrizDinamicaFijo: 1.800000
funcVectorDinamicoFijo: 2.092000
funcMatrizPila        : 2.161000
funcVectorPila        : 2.190000
funcMatrizDinamica    : 2.321000
funcVectorDinamico    : 2.940000
``````

There we see how effectively accessing known positions at compile time is an improvement in time (around 10% -20%).

One result that seems to contradict what I have said is that access times of `vectorDinamico` are worse than those obtained for `matrizDinamica` . This is so because in this very simple example the memory of the matrix not only has many possibilities to end up in consecutive positions but also due to its small size (400 bytes), it should not present too many problems for the paging of the cache. In more complex environments the result of `matrizDinamica` should be much worse.

answered by 12.01.2017 / 10:23
source