// How optimization and compilation options change the code. // The book I mentioned today: Bob Neveln, "Linux Assembly Language Programming", // (https://www.amazon.com/Linux-Assembly-Language-Programming-Neveln/dp/0130879401) // Amazon has good deals on used copies of this book (essentially, shipping costs). $ cat hello.c #include int main() { int i = 10; while( i >= 0 ){ puts( "Hello" ); i = i - 1; } return 42; } $ gcc -o hello hello.c $ gobjdump -d hello hello: file format mach-o-x86-64 Disassembly of section .text: 0000000100000f20 <_main>: 100000f20: 55 push %rbp 100000f21: 48 89 e5 mov %rsp,%rbp 100000f24: 48 83 ec 10 sub $0x10,%rsp 100000f28: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp) 100000f2f: c7 45 f8 0a 00 00 00 movl $0xa,-0x8(%rbp) // i := 10 100000f36: 81 7d f8 00 00 00 00 cmpl $0x0,-0x8(%rbp) 100000f3d: 0f 8c 20 00 00 00 jl 100000f63 <_main+0x43> 100000f43: 48 8d 3d 44 00 00 00 lea 0x44(%rip),%rdi # 100000f8e <_main+0x6e> // "hello" 100000f4a: e8 1f 00 00 00 callq 100000f6e <_main+0x4e> // puts 100000f4f: 8b 4d f8 mov -0x8(%rbp),%ecx // load i from stack frame 100000f52: 81 e9 01 00 00 00 sub $0x1,%ecx // decrement i 100000f58: 89 4d f8 mov %ecx,-0x8(%rbp) // store i into stack frame 100000f5b: 89 45 f4 mov %eax,-0xc(%rbp) // store return value of puts (see "man puts") 100000f5e: e9 d3 ff ff ff jmpq 100000f36 <_main+0x16> // loop back 100000f63: b8 2a 00 00 00 mov $0x2a,%eax 100000f68: 48 83 c4 10 add $0x10,%rsp 100000f6c: 5d pop %rbp 100000f6d: c3 retq // Next level of optimization: local var i is stored in register, costly // memory loads and stores from/to the stack frame are eliminated $ gcc -o hello1 -O1 hello.c $ gobjdump -d hello1 hello1: file format mach-o-x86-64 Disassembly of section .text: 0000000100000f40 <_main>: 100000f40: 55 push %rbp 100000f41: 48 89 e5 mov %rsp,%rbp 100000f44: 41 56 push %r14 // R14 is caller-saved: we are expected to restore it 100000f46: 53 push %rbx 100000f47: bb 0b 00 00 00 mov $0xb,%ebx // "i" is now held in EBX & initialized to 11, not 10! 100000f4c: 4c 8d 35 47 00 00 00 lea 0x47(%rip),%r14 # 100000f9a <_main+0x5a> // "hello" 100000f53: 66 66 66 66 2e 0f 1f data16 data16 data16 nopw %cs:0x0(%rax,%rax,1) // multi-byte NOP (13 bytes) 100000f5a: 84 00 00 00 00 00 100000f60: 4c 89 f7 mov %r14,%rdi // start of loop, 16-byte aligned address 100000f63: e8 10 00 00 00 callq 100000f78 <_main+0x38> // puts 100000f68: ff cb dec %ebx // decrement "i" 100000f6a: 85 db test %ebx,%ebx // test for 0 100000f6c: 7f f2 jg 100000f60 <_main+0x20> // loop back, 16-byte aligned is better for branch prediction 100000f6e: b8 2a 00 00 00 mov $0x2a,%eax 100000f73: 5b pop %rbx 100000f74: 41 5e pop %r14 // restoring R14 to what it was in the previous function's context 100000f76: 5d pop %rbp 100000f77: c3 retq // The next optimization level applies "loop unrolling". Branches are // expensive, because they interfere with processor's pipelining of instructions; // when possible (i.e., the number of iterations is a constant known at // compile time) loops are unrolled into simple repetition of loop body code: $ gcc -o hello2 -O2 hello.c $ gobjdump -d hello2 | less hello2: file format mach-o-x86-64 Disassembly of section .text: 0000000100000f00 <_main>: 100000f00: 55 push %rbp 100000f01: 48 89 e5 mov %rsp,%rbp 100000f04: 53 push %rbx 100000f05: 50 push %rax 100000f06: 48 8d 1d 85 00 00 00 lea 0x85(%rip),%rbx # 100000f92 <_main+0x92> 100000f0d: 48 89 df mov %rbx,%rdi 100000f10: e8 5d 00 00 00 callq 100000f72 <_main+0x72> 100000f15: 48 89 df mov %rbx,%rdi 100000f18: e8 55 00 00 00 callq 100000f72 <_main+0x72> 100000f1d: 48 89 df mov %rbx,%rdi 100000f20: e8 4d 00 00 00 callq 100000f72 <_main+0x72> 100000f25: 48 89 df mov %rbx,%rdi 100000f28: e8 45 00 00 00 callq 100000f72 <_main+0x72> 100000f2d: 48 89 df mov %rbx,%rdi 100000f30: e8 3d 00 00 00 callq 100000f72 <_main+0x72> 100000f35: 48 89 df mov %rbx,%rdi 100000f38: e8 35 00 00 00 callq 100000f72 <_main+0x72> 100000f3d: 48 89 df mov %rbx,%rdi 100000f40: e8 2d 00 00 00 callq 100000f72 <_main+0x72> 100000f45: 48 89 df mov %rbx,%rdi 100000f48: e8 25 00 00 00 callq 100000f72 <_main+0x72> 100000f4d: 48 89 df mov %rbx,%rdi 100000f50: e8 1d 00 00 00 callq 100000f72 <_main+0x72> 100000f55: 48 89 df mov %rbx,%rdi 100000f58: e8 15 00 00 00 callq 100000f72 <_main+0x72> 100000f5d: 48 89 df mov %rbx,%rdi 100000f60: e8 0d 00 00 00 callq 100000f72 <_main+0x72> 100000f65: b8 2a 00 00 00 mov $0x2a,%eax 100000f6a: 48 83 c4 08 add $0x8,%rsp 100000f6e: 5b pop %rbx 100000f6f: 5d pop %rbp 100000f70: c3 retq $ gcc -o hello3 -O3 hello.c $ gobjdump -d hello3 | less $ gcc -o hello4 -O4 hello.c clang: warning: -O4 is equivalent to -O3 $ gcc -o hello4 -O5 hello.c warning: optimization level '-O5' is not supported; using '-O3' instead 1 warning generated. // Look at the -f options. You can change so many things about compiler's behavior! $ gcc --help | less // Compiling without the frame pointer. Now all contents of the stack // frame, including the local variable i, are accessed via offsets // off of RSP: $ gcc -Wall -o hello-nofp -fomit-frame-pointer hello.c $ gobjdump -d hello-nofp hello: file format mach-o-x86-64 Disassembly of section .text: 0000000100000f20 <_main>: 100000f20: 48 83 ec 18 sub $0x18,%rsp 100000f24: c7 44 24 14 00 00 00 movl $0x0,0x14(%rsp) 100000f2b: 00 100000f2c: c7 44 24 10 0a 00 00 movl $0xa,0x10(%rsp) 100000f33: 00 100000f34: 81 7c 24 10 00 00 00 cmpl $0x0,0x10(%rsp) 100000f3b: 00 100000f3c: 0f 8c 23 00 00 00 jl 100000f65 <_main+0x45> 100000f42: 48 8d 3d 49 00 00 00 lea 0x49(%rip),%rdi # 100000f92 <_main+0x72> // "hello" 100000f49: e8 22 00 00 00 callq 100000f70 <_main+0x50> // puts 100000f4e: 8b 4c 24 10 mov 0x10(%rsp),%ecx // load i 100000f52: 81 e9 01 00 00 00 sub $0x1,%ecx // decrement i by 1 100000f58: 89 4c 24 10 mov %ecx,0x10(%rsp) // store i 100000f5c: 89 44 24 0c mov %eax,0xc(%rsp) // return value of puts saved in frame 100000f60: e9 cf ff ff ff jmpq 100000f34 <_main+0x14> 100000f65: b8 2a 00 00 00 mov $0x2a,%eax 100000f6a: 48 83 c4 18 add $0x18,%rsp 100000f6e: c3 retq $ gobjdump -d hello | less $ cat struct.c #include /* Use this simple program to see how struct member accesses are compiled. */ struct item { int num; char *name; }; struct item it[3] = { {1, "foo"}, {2, "bar"}, {3, "baz"} }; int main() { int k; struct item *p; for(k = 1; k < 4; k++) printf("item %d: %s\n", it[k-1].num, it[k-1].name); /* comment this out first */ //p = &it[0]; /* recall that [] has higher precedence than & (address-of); // cf. http://en.cppreference.com/w/c/language/operator_precedence */ //for(k = 1; k < 4; k++){ // printf("item %d again: %s\n", p->num, p->name); // p++; //} return 42; } $ gcc -Wall -o struct struct.c struct.c:17:19: warning: unused variable 'p' [-Wunused-variable] struct item *p; ^ 1 warning generated. $ ./struct item 1: foo item 2: bar item 3: baz $ gobjdump -d struct // this gives us a few annotations of the disassembled code that lldb below misses, // such as the address of the global "it" struct: file format mach-o-x86-64 Disassembly of section .text: 0000000100000ee0 <_main>: 100000ee0: 55 push %rbp 100000ee1: 48 89 e5 mov %rsp,%rbp 100000ee4: 48 83 ec 20 sub $0x20,%rsp 100000ee8: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp) 100000eef: c7 45 f8 01 00 00 00 movl $0x1,-0x8(%rbp) 100000ef6: 81 7d f8 04 00 00 00 cmpl $0x4,-0x8(%rbp) 100000efd: 0f 8d 57 00 00 00 jge 100000f5a <_main+0x7a> 100000f03: 48 8d 3d 88 00 00 00 lea 0x88(%rip),%rdi # 100000f92 <_main+0xb2> 100000f0a: 48 8d 05 0f 01 00 00 lea 0x10f(%rip),%rax # 100001020 <_it> 100000f11: 8b 4d f8 mov -0x8(%rbp),%ecx 100000f14: 81 e9 01 00 00 00 sub $0x1,%ecx 100000f1a: 48 63 d1 movslq %ecx,%rdx 100000f1d: 48 c1 e2 04 shl $0x4,%rdx 100000f21: 48 89 c6 mov %rax,%rsi 100000f24: 48 01 d6 add %rdx,%rsi 100000f27: 8b 36 mov (%rsi),%esi 100000f29: 8b 4d f8 mov -0x8(%rbp),%ecx 100000f2c: 81 e9 01 00 00 00 sub $0x1,%ecx 100000f32: 48 63 d1 movslq %ecx,%rdx 100000f35: 48 c1 e2 04 shl $0x4,%rdx 100000f39: 48 01 d0 add %rdx,%rax 100000f3c: 48 8b 50 08 mov 0x8(%rax),%rdx 100000f40: b0 00 mov $0x0,%al 100000f42: e8 1f 00 00 00 callq 100000f66 <_main+0x86> 100000f47: 89 45 ec mov %eax,-0x14(%rbp) 100000f4a: 8b 45 f8 mov -0x8(%rbp),%eax 100000f4d: 05 01 00 00 00 add $0x1,%eax 100000f52: 89 45 f8 mov %eax,-0x8(%rbp) 100000f55: e9 9c ff ff ff jmpq 100000ef6 <_main+0x16> 100000f5a: b8 2a 00 00 00 mov $0x2a,%eax 100000f5f: 48 83 c4 20 add $0x20,%rsp 100000f63: 5d pop %rbp 100000f64: c3 retq // recompiling with debugging support, for a slightly more verbose disasm: $ gcc -g -Wall -o struct struct.c struct.c:17:19: warning: unused variable 'p' [-Wunused-variable] struct item *p; ^ 1 warning generated. // The struct item holds an integer (4 bytes) and a pointer (8 bytes == 64 bits). // So you might expect the size of struct item to be 12 bytes---but instead it's 16 bytes! // The LLVM compiler lays out the structure to preserve the order of its fields, // and also to align the pointer at 8-byte boundary, for access efficiency. // Hence there are 4 unused bytes between the integer and the pointer fields, // and the array of struct item is walked in 16-byte steps. // In the following code, note how the individual elements of the global array are found, // and, once each element (a struct item) is found, how its fields are loaded. $ lldb struct (lldb) target create "struct" 2016-09-28 17:04:58.837 lldb[2242:114097] Metadata.framework [Error]: couldn't get the client port Current executable set to 'struct' (x86_64). (lldb) disas -n main struct`main: struct[0x100000ee0] <+0>: pushq %rbp struct[0x100000ee1] <+1>: movq %rsp, %rbp struct[0x100000ee4] <+4>: subq $0x20, %rsp struct[0x100000ee8] <+8>: movl $0x0, -0x4(%rbp) struct[0x100000eef] <+15>: movl $0x1, -0x8(%rbp) // k := 1 struct[0x100000ef6] <+22>: cmpl $0x4, -0x8(%rbp) // start of the for-loop struct[0x100000efd] <+29>: jge 0x100000f5a ; <+122> at struct.c:19 struct[0x100000f03] <+35>: leaq 0x88(%rip), %rdi ; "item %d: %s\n" struct[0x100000f0a] <+42>: leaq 0x10f(%rip), %rax // this is where the global "it" array starts struct[0x100000f11] <+49>: movl -0x8(%rbp), %ecx // load "k" struct[0x100000f14] <+52>: subl $0x1, %ecx // "k-1" struct[0x100000f1a] <+58>: movslq %ecx, %rdx // sign-extend "k-1" into RDX -- we want to make a pointer (64bit) struct[0x100000f1d] <+61>: shlq $0x4, %rdx // (k-1)*16 struct[0x100000f21] <+65>: movq %rax, %rsi // &it[0] struct[0x100000f24] <+68>: addq %rdx, %rsi // &it[0] + (k-1)*16 struct[0x100000f27] <+71>: movl (%rsi), %esi // load 4 bytes from the above address; it's "num", printf's 2nd arg struct[0x100000f29] <+73>: movl -0x8(%rbp), %ecx // repeat this calculation for "name" struct[0x100000f2c] <+76>: subl $0x1, %ecx struct[0x100000f32] <+82>: movslq %ecx, %rdx struct[0x100000f35] <+85>: shlq $0x4, %rdx struct[0x100000f39] <+89>: addq %rdx, %rax struct[0x100000f3c] <+92>: movq 0x8(%rax), %rdx // loading "name", an 8-byte pointer value (printf's 3rd arument) struct[0x100000f40] <+96>: movb $0x0, %al // printf takes an extra argument, as per varargs convention struct[0x100000f42] <+98>: callq 0x100000f66 ; symbol stub for: printf struct[0x100000f47] <+103>: movl %eax, -0x14(%rbp) // save return value of printf struct[0x100000f4a] <+106>: movl -0x8(%rbp), %eax // load "k" struct[0x100000f4d] <+109>: addl $0x1, %eax // increment "k" struct[0x100000f52] <+114>: movl %eax, -0x8(%rbp) // store "k" struct[0x100000f55] <+117>: jmp 0x100000ef6 ; <+22> at struct.c:19 // loop back struct[0x100000f5a] <+122>: movl $0x2a, %eax struct[0x100000f5f] <+127>: addq $0x20, %rsp struct[0x100000f63] <+131>: popq %rbp struct[0x100000f64] <+132>: retq // Now compile this with optimizations and see what changes! // For parsing binary files that have no padding in their structures, // or of network messages (such as TCP/IP packets), we need structures // of exact size and layout, without padding/alignment optimizations as above. // For such cases, compilers support extensions that tell them to "pack" // structs (and the code that accesses them) as tightly as possible. $ cat struct-packed.c #include /* Use this simple program to see how struct member accesses are compiled. */ struct item { int num; char *name; }__attribute__((packed)); struct item it[3] = { {1, "foo"}, {2, "bar"}, {3, "baz"} }; int main() { int k; for(k = 1; k < 4; k++) printf("item %d: %s\n", it[k-1].num, it[k-1].name); return 42; } $ gcc -Wall -o struct-packed struct-packed.c // Note the changes: struct item is now 12 bytes long (0xc): $ gobjdump -d struct-packed struct-packed: file format mach-o-x86-64 Disassembly of section .text: 0000000100000ed0 <_main>: 100000ed0: 55 push %rbp 100000ed1: 48 89 e5 mov %rsp,%rbp 100000ed4: 48 83 ec 20 sub $0x20,%rsp 100000ed8: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp) 100000edf: c7 45 f8 01 00 00 00 movl $0x1,-0x8(%rbp) 100000ee6: 81 7d f8 04 00 00 00 cmpl $0x4,-0x8(%rbp) 100000eed: 0f 8d 5d 00 00 00 jge 100000f50 <_main+0x80> 100000ef3: 48 8d 3d 90 00 00 00 lea 0x90(%rip),%rdi # 100000f8a <_main+0xba> // "item %d: %s\n" 100000efa: 48 8d 05 1f 01 00 00 lea 0x11f(%rip),%rax # 100001020 <_it> 100000f01: 8b 4d f8 mov -0x8(%rbp),%ecx // load "k" 100000f04: 81 e9 01 00 00 00 sub $0x1,%ecx 100000f0a: 48 63 d1 movslq %ecx,%rdx 100000f0d: 48 69 d2 0c 00 00 00 imul $0xc,%rdx,%rdx // rdx *= 12 100000f14: 48 89 c6 mov %rax,%rsi 100000f17: 48 01 d6 add %rdx,%rsi // &it[0] + (k-1)*12 100000f1a: 8b 36 mov (%rsi),%esi // load "num" where 2nd arg to printf must go 100000f1c: 8b 4d f8 mov -0x8(%rbp),%ecx // same again 100000f1f: 81 e9 01 00 00 00 sub $0x1,%ecx 100000f25: 48 63 d1 movslq %ecx,%rdx 100000f28: 48 69 d2 0c 00 00 00 imul $0xc,%rdx,%rdx 100000f2f: 48 01 d0 add %rdx,%rax 100000f32: 48 8b 50 04 mov 0x4(%rax),%rdx // load "name" where 3rd arg to printf must go 100000f36: b0 00 mov $0x0,%al 100000f38: e8 1f 00 00 00 callq 100000f5c <_main+0x8c> // call printf 100000f3d: 89 45 ec mov %eax,-0x14(%rbp) 100000f40: 8b 45 f8 mov -0x8(%rbp),%eax 100000f43: 05 01 00 00 00 add $0x1,%eax 100000f48: 89 45 f8 mov %eax,-0x8(%rbp) 100000f4b: e9 96 ff ff ff jmpq 100000ee6 <_main+0x16> 100000f50: b8 2a 00 00 00 mov $0x2a,%eax 100000f55: 48 83 c4 20 add $0x20,%rsp 100000f59: 5d pop %rbp 100000f5a: c3 retq // Note that in C, &it[0] + 1 will give you the pointer to it[1], i.e., 16/12 bytes off // the start of the array. In C pointer arithmetic, when you add an integer to // a pointer, p + n, you get the address n*(the size of type that p points to) off of p. // Experiment with a[i], *(a+i), and i[a] for different types of an array a.