/*
	Measure latency of PUSHA and POPA instructions, $Revision: 1.5 $

	Author: Wojciech Muła
	e-mail: wojciech_mula@poczta.onet.pl
	www:    http://0x80.pl

	License: public domain

	initial release 28-05-2008, last update $Date: 2008-06-27 19:04:16 $

	----------------------------------------------------------------------

	The program can measure the latency of a single PUSHA instruction or
	of a PUSHA/POPA pair.  It also includes procedures that use individual
	PUSH and POP instructions to perform the same operations as PUSHA/POPA.

	Compilation (32-bit x86 only; on an x86-64 host add -m32, because
	PUSHA/POPA are not available in 64-bit mode):

		gcc -O3 pushapopa-test.c -o your_favorite_name

	Usage:

		run the program without arguments to read the help

*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/*
 * Read the processor time-stamp counter.
 *
 * RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX; both halves are combined into a single 64-bit value.
 */
uint64_t rdtsc() {
	uint32_t lo;
	uint32_t hi;

	__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));

	return ((uint64_t)hi << 32) | (uint64_t)lo;
}


// Sink for the ECX output operand of every measurement loop.
uint32_t dummy;
// Scratch slot used by the PUSH sequences: ESP is stored here and then
// pushed from memory, in the position where PUSHA pushes the original ESP.
uint32_t temp;
// Saved stack pointer; the push-only loops reload ESP from here after each
// iteration.
// NOTE: `temp` and `ESP` are referenced by symbol name inside the inline
// assembly strings, where the compiler cannot see the use; renaming them
// requires updating those strings as well.
uint32_t ESP;


/*
 * Execute `iters` iterations of a single PUSHA (pushal) instruction.
 *
 * The stack pointer is stored in the global ESP before the loop and is
 * reloaded after every pushal, so the stack does not grow across iterations.
 *
 * iters - number of iterations; must be greater than zero, because the
 *         counter is decremented and tested only after the loop body.
 *
 * The final value of ECX is written to the global `dummy`.
 */
void pusha(long int iters) {
	// The save of %esp and the loop that restores it are kept in ONE asm
	// statement: the compiler does not guarantee that separate asm
	// statements stay adjacent, and code placed between them could change
	// %esp after it was saved.  The "memory" clobber tells the compiler
	// that the asm writes the global ESP (and the stack).
	__asm__ volatile(
		"	movl %%esp, (ESP)	\n"
		"0:				\n"
		"	pushal			\n"
		"	movl (ESP), %%esp	\n"
		"	subl    $1, %%ecx	\n"
		"	jnz     0b		\n"
		: "=c" (dummy)
		: "c" (iters)
		: "memory", "cc"
	);
}


/*
 * Execute `iters` iterations of a PUSH sequence that stores the same
 * registers, in the same order, as a single PUSHA.
 *
 * The current ESP is first copied to the global `temp` and pushed from
 * there, in the slot where PUSHA stores the original stack pointer.
 * After each sequence ESP is reloaded from the global ESP, so the stack
 * does not grow across iterations.
 *
 * iters - number of iterations; must be greater than zero, because the
 *         counter is decremented and tested only after the loop body.
 *
 * The final value of ECX is written to the global `dummy`.
 */
void push(long int iters) {
	// The save of %esp is part of the same asm statement as the loop that
	// restores it: separate asm statements are not guaranteed to stay
	// adjacent.  The "memory" clobber covers the stores to the globals
	// ESP and temp (and to the stack).
	__asm__ volatile(
		"movl	%%esp, (ESP)	\n"
		"0:			\n"

		// emulated pusha
		"movl	%%esp, (temp)	\n"
		"pushl	%%eax		\n"
		"pushl	%%ecx		\n"
		"pushl	%%edx		\n"
		"pushl	%%ebx		\n"
		"pushl	temp		\n"
		"pushl	%%ebp		\n"
		"pushl	%%esi		\n"
		"pushl	%%edi		\n"

		// drop everything pushed in this iteration
		"movl	(ESP), %%esp	\n"

		"subl   $1, %%ecx	\n"
		"jnz    0b		\n"

		: "=c" (dummy)
		: "c" (iters)
		: "memory", "cc"
		);
}


/*
 * Execute `iters` iterations of a PUSHA/POPA (pushal/popal) pair.
 *
 * Each popal removes what the preceding pushal stored, so the stack pointer
 * is the same at the start of every iteration and no explicit restore is
 * needed.
 *
 * iters - number of iterations; must be greater than zero, because the
 *         counter is decremented and tested only after the loop body.
 *
 * The final value of ECX is written to the global `dummy`.
 */
void pusha_popa(long int iters) {
	__asm__ volatile(
		"0:		\n"
		"pushal		\n"
		"popal		\n"
		// ECX holds the loop counter; popal has just restored it
		"subl $1, %%ecx	\n"
		"jnz 0b		\n"
		: "=c" (dummy)
		: "c" (iters)
	);
}



/*
 * Execute `iters` iterations of a PUSH/POP sequence that performs the same
 * operations as a PUSHA/POPA pair.
 *
 * The push half stores the current ESP in the global `temp` and pushes it
 * from there, in the slot where PUSHA stores the original stack pointer.
 * The pop half restores the registers in reverse order and skips that slot
 * instead of loading it into ESP.
 *
 * iters - number of iterations; must be greater than zero, because the
 *         counter is decremented and tested only after the loop body.
 *
 * The final value of ECX is written to the global `dummy`.
 */
void push_pop(long int iters) {
	// "memory" is listed as clobbered because the asm stores to the global
	// `temp` (and to the stack), which is not visible through the operands.
	__asm__ volatile(
		"0:			\n"
		// pusha
		"movl	%%esp, (temp)	\n"
		"pushl	%%eax	\n"
		"pushl	%%ecx	\n"
		"pushl	%%edx	\n"
		"pushl	%%ebx	\n"
		"pushl	temp	\n"
		"pushl	%%ebp	\n"
		"pushl	%%esi	\n"
		"pushl	%%edi	\n"

		// popa
		"popl	%%edi	\n"
		"popl	%%esi	\n"
		"popl	%%ebp	\n"
		// skip the saved-ESP slot without loading it
		"addl	$4, %%esp	\n"
		"popl	%%ebx	\n"
		"popl	%%edx	\n"
		// ECX (the loop counter) gets back the value it had at the push
		"popl	%%ecx	\n"
		"popl	%%eax	\n"

		"subl   $1, %%ecx	\n"
		"jnz	0b		\n"
		: "=c" (dummy)
		: "c" (iters)
		: "memory", "cc"
	);
}


/*
 * Print usage information to stdout and terminate the process.
 *
 * progname - program name shown in the usage line (normally argv[0]);
 *            a fallback name is used when it is NULL.
 *
 * Does not return: always ends with exit(1).
 */
void help(char* progname) {
	// argv[0] can be NULL when the program is started with an empty
	// argument vector; passing NULL to %s is undefined behaviour
	if (progname == NULL)
		progname = "pushapopa-test";

	puts("Program measures latency of PUSHA and POPA instructions");

	puts("\nusage:");
	printf("\t%s pusha|pushapopa|push|pushpop iter-count\n", progname);

	puts("\nwhere:");
	puts("- pusha     - single PUSHA instruction");
	puts("- pushapopa - pair PUSHA and POPA instructions");
	puts("- push      - PUSH sequence that performs same operation as PUSHA");
	puts("- pushpop   - PUSH/POP sequence that performs same operation as PUSHA/POPA");
	exit(1);
}


/*
 * Entry point.
 *
 * Command line: <mode> <iter-count>, where mode is one of
 * pusha | push | pushapopa | pushpop (case-insensitive) and iter-count is
 * a positive decimal integer.  Any other command line ends in help(),
 * which prints the usage and exits with status 1.
 *
 * The program first times an empty loop of the same shape as the tested
 * one (including the ESP reload for the push-only modes), then times the
 * tested loop, and prints the difference and the difference per iteration.
 *
 * Returns 0 on success, 2 if no mode was selected.
 */
int main(int argc, char* argv[]) {
	enum {
		MODE_PUSHA      = 0,
		MODE_PUSH       = 1,
		MODE_PUSHA_POPA = 100,
		MODE_PUSH_POP   = 200
	};

	// parsing cmd line
	if (argc != 3)
		help(argv[0]);

	// reject empty input, trailing garbage ("12abc") and non-positive counts
	char* end = NULL;
	long int iters = strtol(argv[2], &end, 10);
	if (end == argv[2] || *end != '\0' || iters <= 0)
		help(argv[0]);

	int fun = -1;
	const char* label = "";

	if (strcasecmp(argv[1], "pusha") == 0) {
		fun = MODE_PUSHA;
		label = "pushal";
	} else if (strcasecmp(argv[1], "push") == 0) {
		fun = MODE_PUSH;
		label = "sequence pushl";
	} else if (strcasecmp(argv[1], "pushapopa") == 0) {
		fun = MODE_PUSHA_POPA;
		label = "pushal/popal";
	} else if (strcasecmp(argv[1], "pushpop") == 0) {
		fun = MODE_PUSH_POP;
		label = "sequence pushl/popl";
	} else {
		help(argv[0]);
	}

	// measure loop cost
	uint64_t t1_loop = 0;
	uint64_t t2_loop = 0;

	switch (fun) {
		case MODE_PUSHA:
		case MODE_PUSH:
			// the push-only loops reload ESP every iteration, so the
			// baseline loop does the same
			t1_loop = rdtsc();
			__asm__ volatile (
				"mov %%esp, (ESP)	\n"
				"0:			\n"
				"mov (ESP), %%esp	\n"
				"subl   $1, %%ecx	\n"
				"jnz    0b		\n"
				: "=c" (dummy)
				: "c" (iters)
				: "memory", "cc"
			);
			t2_loop = rdtsc();
			break;

		case MODE_PUSHA_POPA:
		case MODE_PUSH_POP:
			t1_loop = rdtsc();
			__asm__ volatile (
				"0:			\n"
				"subl   $1, %%ecx	\n"
				"jnz    0b		\n"
				: "=c" (dummy)
				: "c" (iters)
				: "cc"
			);
			t2_loop = rdtsc();
			break;

		default:
			return 2;
	}

	// testing
	// the label is printed BEFORE the first timestamp so that the cost of
	// printf is not counted as instruction latency
	printf("%s, iters=%ld\n", label, iters);

	uint64_t t1 = rdtsc();
	switch (fun) {
		case MODE_PUSHA:
			pusha(iters);
			break;

		case MODE_PUSH:
			push(iters);
			break;

		case MODE_PUSHA_POPA:
			pusha_popa(iters);
			break;

		case MODE_PUSH_POP:
			push_pop(iters);
			break;

		default:
			break;
	}
	uint64_t t2 = rdtsc();

	// signed arithmetic: the difference can be negative when the baseline
	// loop happened to take longer than the tested one, and the values
	// must match the %lld conversions
	long long dt = (long long)(t2 - t1);
	long long dt_loop = (long long)(t2_loop - t1_loop);
	long long net = dt - dt_loop;

	printf("cycles: %lld - %lld = %lld\n", dt, dt_loop, net);
	printf("cycles per iters = %lld\n", net / iters);

	return 0;
}

