Zoran's Blog: Instructions per Second on current CPUs and how does this translate to high level languages

CPU Instructions per Second:

https://en.wikipedia.org/wiki/Instructions_per_second

MIPS = Million instructions per second
GIPS = Billion instructions per second

Synthetic benchmarks such as Dhrystone (as used by SIsoftware) are now generally used to estimate computer performance in commonly used applications. Below are Dhrystone results for current CPUs:

Here are some rough rules of thumb:

Let us assume conservative estimate of 5 GIPS/Ghz. Hence a one 3 GHz core can execute roughly 15 billion machine instructructions per second. Yet this is machine instructions, yet how could this translate to a high level language like C?

This is very tough to estimate, but lets assume a that for every line of C we have 10 machine instructions, then we a we can have 500 C MIPS/GHz. Hence a 2 GHz CPU can execute roughly 1 billion C instructions per second. Modern CPUs are very, very fast when application is CPU bound.

If we assume a C application transaction takes 10ms to process (and it is CPU bound) on a 2 GHz CPU then is is executing 2 (GHz) x 500 (C MIPS) x 0.01 = 10 million lines of C code. If we have requirement of 200 transactions per second and our CPU is running at 2 GHz then we need the following number of CPU cores:
200 (transactions per second) / (1 (second) / 0.01 (10ms) ) = 2 CPU cores

I used the below Sieve of Eratosthenes prime number C code to estimate C to machine instruction ratio:
https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes

Calculate primes up to 1,000,000:
#include <stdio.h>

int main()
{
int number,i,j;
number=1000000;

int primes[number+1];

//populating array with naturals numbers
for(i = 2; i<=number; i++)
primes[i] = i;

i = 2;
while ((i*i) <= number)
{
if (primes[i] != 0)
{
for(j=2; j<number; j++)
{
if (primes[i]*j > number)
break;
else
// Instead of deleteing , making elemnets 0
primes[primes[i]*j]=0;
}
}
i++;
}

for(i = 2; i<=number; i++)
{
//If number is not 0 then it is prime
if (primes[i]!=0)
printf("%d\n",primes[i]);
}

return 0;
}

http://www.delorie.com/djgpp/v2faq/faq8_20.html

$ gcc -O2 -S -c sieve.c # to produce assembler code

$ cat sieve.s

.file "sieve.c"

.section .rodata.str1.1,"aMS",@progbits,1

.LC0:

.string "%d\n"

.text

.p2align 4,,15

.globl main

.type main, @function

main:

.LFB11:

.cfi_startproc

pushq %rbp

.cfi_def_cfa_offset 16

.cfi_offset 6, -16

movl $2, %eax

movq %rsp, %rbp

.cfi_def_cfa_register 6

pushq %r12

pushq %rbx

subq $4000032, %rsp

leaq 15(%rsp), %r12

.cfi_offset 3, -32

.cfi_offset 12, -24

andq $-16, %r12

movq %r12, %rdx

.p2align 4,,10

.p2align 3

.L2:

movl %eax, 8(%rdx)

addl $1, %eax

addq $4, %rdx

cmpl $1000001, %eax

jne .L2

movl $2, %ebx

.p2align 4,,10

.p2align 3

.L5:

movslq %ebx, %rcx

movl (%r12,%rcx,4), %edx

testl %edx, %edx

je .L3

movl $2, %eax

jmp .L4

.p2align 4,,10

.p2align 3

.L13:

addl $1, %eax

movslq %edx, %rdx

cmpl $1000000, %eax

movl $0, (%r12,%rdx,4)

je .L3

movl (%r12,%rcx,4), %edx

.L4:

imull %eax, %edx

cmpl $1000000, %edx

jle .L13

.L3:

addl $1, %ebx

movl %ebx, %eax

imull %ebx, %eax

cmpl $1000000, %eax

jle .L5

xorl %ebx, %ebx

jmp .L7

.p2align 4,,10

.p2align 3

.L6:

addq $4, %rbx

cmpq $3999996, %rbx

je .L14

.L7:

movl 8(%r12,%rbx), %esi

testl %esi, %esi

je .L6

xorl %eax, %eax

movl $.LC0, %edi

addq $4, %rbx

call printf

cmpq $3999996, %rbx

jne .L7

.L14:

leaq -16(%rbp), %rsp

xorl %eax, %eax

popq %rbx

popq %r12

leave

.cfi_def_cfa 7, 8

ret

.cfi_endproc

.LFE11:

.size main, .-main

.ident "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-18)"

.section .note.GNU-stack,"",@progbits

$ wc -l sieve.c sieve.s

39 sieve.c # number of line of c code

93 sieve.s # number of assembler instructions

$ gcc -c -g -Wa,-a,-ad s.c > sieve.lst

$ cat sieve.lst

GAS LISTING /tmp/cccMtKmE.s page 1

1 .file "s.c"

9 .Ltext0:

10 .section .rodata

11 .LC0:

12 0000 25640A00 .string "%d\n"

13 .text

14 .globl main

16 main:

17 .LFB0:

18 .file 1 "s.c"

0:s.c **** #include <stdio.h>

1:s.c ****

2:s.c **** int main()

3:s.c **** {

19 .loc 1 4 0

20 .cfi_startproc

21 0000 55 pushq %rbp

22 .cfi_def_cfa_offset 16

23 .cfi_offset 6, -16

24 0001 4889E5 movq %rsp, %rbp

25 .cfi_def_cfa_register 6

26 0004 53 pushq %rbx

27 0005 4883EC28 subq $40, %rsp

28 .loc 1 4 0

29 0009 4889E0 movq %rsp, %rax

30 000c 4889C3 movq %rax, %rbx

31 .cfi_offset 3, -24

4:s.c **** int number,i,j;

5:s.c **** number=1000000;

32 .loc 1 6 0

33 000f C745E440 movl $1000000, -28(%rbp)

33 420F00

6:s.c ****

7:s.c **** int primes[number+1];

34 .loc 1 8 0

35 0016 8B45E4 movl -28(%rbp), %eax

36 0019 83C001 addl $1, %eax

37 001c 4863D0 movslq %eax, %rdx

38 001f 4883EA01 subq $1, %rdx

39 0023 488955D0 movq %rdx, -48(%rbp)

40 0027 4898 cltq

41 0029 48C1E002 salq $2, %rax

42 002d 4883C00F addq $15, %rax

43 0031 4883C00F addq $15, %rax

44 0035 48C1E804 shrq $4, %rax

45 0039 48C1E004 salq $4, %rax

46 003d 4829C4 subq %rax, %rsp

47 0040 4889E0 movq %rsp, %rax

48 0043 4883C00F addq $15, %rax

49 0047 48C1E804 shrq $4, %rax

50 004b 48C1E004 salq $4, %rax

51 004f 488945D8 movq %rax, -40(%rbp)

8:s.c ****

9:s.c **** //populating array with naturals numbers

10:s.c **** for(i = 2; i<=number; i++)

52 .loc 1 11 0

53 0053 C745E802 movl $2, -24(%rbp)

GAS LISTING /tmp/cccMtKmE.s page 2

53 000000

54 005a EB14 jmp .L2

55 .L3:

11:s.c **** primes[i] = i;

56 .loc 1 12 0

57 005c 8B55E8 movl -24(%rbp), %edx

58 005f 488B45D8 movq -40(%rbp), %rax

59 0063 4863D2 movslq %edx, %rdx

60 0066 8B4DE8 movl -24(%rbp), %ecx

61 0069 890C90 movl %ecx, (%rax,%rdx,4)

11:s.c **** primes[i] = i;

62 .loc 1 11 0

63 006c 8345E801 addl $1, -24(%rbp)

64 .L2:

65 0070 8B45E8 movl -24(%rbp), %eax

66 0073 3B45E4 cmpl -28(%rbp), %eax

67 0076 7EE4 jle .L3

12:s.c ****

13:s.c **** i = 2;

68 .loc 1 14 0

69 0078 C745E802 movl $2, -24(%rbp)

69 000000

14:s.c **** while ((i*i) <= number)

70 .loc 1 15 0

71 007f EB64 jmp .L4

72 .L9:

15:s.c **** {

16:s.c **** if (primes[i] != 0)

73 .loc 1 17 0

74 0081 8B55E8 movl -24(%rbp), %edx

75 0084 488B45D8 movq -40(%rbp), %rax

76 0088 4863D2 movslq %edx, %rdx

77 008b 8B0490 movl (%rax,%rdx,4), %eax

78 008e 85C0 testl %eax, %eax

79 0090 744F je .L5

17:s.c **** {

18:s.c **** for(j=2; j<number; j++)

80 .loc 1 19 0

81 0092 C745EC02 movl $2, -20(%rbp)

81 000000

82 0099 EB3B jmp .L6

83 .L8:

19:s.c **** {

20:s.c **** if (primes[i]*j > number)

84 .loc 1 21 0

85 009b 8B55E8 movl -24(%rbp), %edx

86 009e 488B45D8 movq -40(%rbp), %rax

87 00a2 4863D2 movslq %edx, %rdx

88 00a5 8B0490 movl (%rax,%rdx,4), %eax

89 00a8 0FAF45EC imull -20(%rbp), %eax

90 00ac 3B45E4 cmpl -28(%rbp), %eax

91 00af 7F2F jg .L14

92 .L7:

21:s.c **** break;

22:s.c **** else

23:s.c **** // Instead of deleteing , making elemnets 0

24:s.c **** primes[primes[i]*j]=0;

GAS LISTING /tmp/cccMtKmE.s page 3

93 .loc 1 25 0

94 00b1 8B55E8 movl -24(%rbp), %edx

95 00b4 488B45D8 movq -40(%rbp), %rax

96 00b8 4863D2 movslq %edx, %rdx

97 00bb 8B0490 movl (%rax,%rdx,4), %eax

98 00be 89C2 movl %eax, %edx

99 00c0 0FAF55EC imull -20(%rbp), %edx

100 00c4 488B45D8 movq -40(%rbp), %rax

101 00c8 4863D2 movslq %edx, %rdx

102 00cb C7049000 movl $0, (%rax,%rdx,4)

102 000000

19:s.c **** {

103 .loc 1 19 0

104 00d2 8345EC01 addl $1, -20(%rbp)

105 .L6:

106 00d6 8B45EC movl -20(%rbp), %eax

107 00d9 3B45E4 cmpl -28(%rbp), %eax

108 00dc 7CBD jl .L8

109 00de EB01 jmp .L5

110 .L14:

22:s.c **** break;

111 .loc 1 22 0

112 00e0 90 nop

113 .L5:

25:s.c **** }

26:s.c **** }

27:s.c **** i++;

114 .loc 1 28 0

115 00e1 8345E801 addl $1, -24(%rbp)

116 .L4:

15:s.c **** while ((i*i) <= number)

117 .loc 1 15 0

118 00e5 8B45E8 movl -24(%rbp), %eax

119 00e8 0FAF45E8 imull -24(%rbp), %eax

120 00ec 3B45E4 cmpl -28(%rbp), %eax

121 00ef 7E90 jle .L9

28:s.c **** }

29:s.c ****

30:s.c **** for(i = 2; i<=number; i++)

122 .loc 1 31 0

123 00f1 C745E802 movl $2, -24(%rbp)

123 000000

124 00f8 EB36 jmp .L10

125 .L12:

31:s.c **** {

32:s.c **** //If number is not 0 then it is prime

33:s.c **** if (primes[i]!=0)

126 .loc 1 34 0

127 00fa 8B55E8 movl -24(%rbp), %edx

128 00fd 488B45D8 movq -40(%rbp), %rax

129 0101 4863D2 movslq %edx, %rdx

130 0104 8B0490 movl (%rax,%rdx,4), %eax

131 0107 85C0 testl %eax, %eax

132 0109 7421 je .L11

34:s.c **** printf("%d\n",primes[i]);

133 .loc 1 35 0

134 010b 8B55E8 movl -24(%rbp), %edx

GAS LISTING /tmp/cccMtKmE.s page 4

135 010e 488B45D8 movq -40(%rbp), %rax

136 0112 4863D2 movslq %edx, %rdx

137 0115 8B1490 movl (%rax,%rdx,4), %edx

138 0118 B8000000 movl $.LC0, %eax

138 00

139 011d 89D6 movl %edx, %esi

140 011f 4889C7 movq %rax, %rdi

141 0122 B8000000 movl $0, %eax

141 00

142 0127 E8000000 call printf

142 00

143 .L11:

31:s.c **** {

144 .loc 1 31 0

145 012c 8345E801 addl $1, -24(%rbp)

146 .L10:

147 0130 8B45E8 movl -24(%rbp), %eax

148 0133 3B45E4 cmpl -28(%rbp), %eax

149 0136 7EC2 jle .L12

35:s.c **** }

36:s.c ****

37:s.c **** return 0;

150 .loc 1 38 0

151 0138 B8000000 movl $0, %eax

151 00

152 013d 4889DC movq %rbx, %rsp

38:s.c **** }

153 .loc 1 39 0

154 0140 488B5DF8 movq -8(%rbp), %rbx

155 0144 C9 leave

156 .cfi_def_cfa 7, 8

157 0145 C3 ret

158 .cfi_endproc

159 .LFE0:

161 .Letext0:

GAS LISTING /tmp/cccMtKmE.s page 5

DEFINED SYMBOLS

*ABS*:0000000000000000 s.c

/tmp/cccMtKmE.s:16 .text:0000000000000000 main

UNDEFINED SYMBOLS

printf

$ sudo yum install -y perf
$ perf stat ./sieve > /dev/null

Performance counter stats for './sieve':

83.077618 task-clock (msec) # 0.991 CPUs utilized
4 context-switches # 0.048 K/sec
0 cpu-migrations # 0.000 K/sec
592 page-faults # 0.007 M/sec
120,181,719 cycles # 1.447 GHz (49.56%)
0 stalled-cycles-frontend (49.65%)
0 stalled-cycles-backend # 0.00% backend cycles idle (51.58%)
0 instructions # 0.00 insn per cycle (51.60%)
0 branches # 0.000 K/sec (50.88%)
0 branch-misses # 0.00% of all branches (50.20%)

0.083848321 seconds time elapsed

$ time ./s > /dev/null

real 0m0.086s

user 0m0.080s

sys 0m0.006s

BogoMIPS - Bogus unscientific MIPS:
https://en.wikipedia.org/wiki/BogoMips
In my Linux VM I had:
Total of 12 processors activated (55199.95 BogoMIPS)
As each vCPU is one thread we have 12 vCPUs = 6 cores. 6 cores x 5 GIPS/GHz x 2.3 GHz = 69 GIPS. It is not a bad ballpark estimate.

How do other languages compare in performance:
https://benchmarksgame-team.pages.debian.net/benchmarksgame/
http://attractivechaos.github.io/plb/

Zoran's Blog

Wednesday, 10 April 2019

Instructions per Second on current CPUs and how does this translate to high level languages

No comments:

Post a Comment