/*
*
* 23. TESTING SPEED  (all processors)
* ===================================
* The Pentium family of processors has an internal 64 bit clock counter
* which can be read into EDX:EAX using the instruction RDTSC (read time
* stamp counter). This is very useful for testing exactly how many clock
* cycles a piece of code takes.
*
* The program below is useful for measuring the number of clock cycles a piece
* of code takes. The program executes the code to test 10 times and stores the
* 10 clock counts. The program can be used in both 16 and 32 bit mode on the
* PPlain and PMMX:
*
* The 'filler' instructions before and after the piece of code to test
* are included in order to get consistent results on the PPlain.
* The CLD is a non-pairable instruction which has been inserted to
* make sure the pairing is the same the first time as the subsequent times.
* The eight NOP instructions are inserted to prevent any prefixes in the code
* to test from being decoded in the shadow of the preceding instructions on the
* PPlain. Single byte instructions are used here to obtain the same pairing
* the first time as the subsequent times. The CLC after the code to test is
* a non-pairable instruction which has a shadow under which the 0FH prefix
* of the RDTSC can be decoded so that it is independent of any shadowing
* effect from the code to test on the PPlain.
*
* On the PMMX you may want to insert  XOR EAX,EAX / CPUID  before the
* instructions to test if you want the FIFO instruction buffer to be
* empty, or some time-consuming instruction (f.ex. CLI or AAD) if you
* want the FIFO buffer to be full.
*
* On the PPro and PII you have to put in a serializing instruction like
* CPUID before and after each RDTSC to prevent it from executing in parallel
* with anything else. (CPUID is a serializing instruction which means that
* it flushes the pipeline and waits for all pending operations to finish
* before proceeding. This is useful for testing purposes. CPUID has no
* shadow under which prefixes of subsequent instructions can decode.)
*
* The RDTSC instruction cannot execute in virtual mode on the PPlain and
* PMMX, so if you are running DOS programs you must run in real mode. (Press
* F8 while booting and select 'safe mode command prompt only' or 'bypass
* startup files').
*
* The Pentium processors have special performance monitor counters which can
* count events such as cache misses, misalignments, AGI stalls, etc. Details
* about how to use the performance monitor counters are not covered by this
* manual but can be found in the MMX technology developer's manual.
*
*/

#include <stdio.h>
#include <stdlib.h>

#define ITER 10                 /* number of iterations; also the length of resultlist[] */

/*
 * The three variables below are referenced by symbol name from the inline
 * assembly in main(), which reads and writes them with 32-bit moves.
 */
int counter = 0;                /* loop counter: byte offset into resultlist, advanced by 4 per iteration */
int tics = 0;                   /* temporary storage of clock: EAX (low 32 bits) from the first RDTSC */
int resultlist[ITER];           /* list of test results: one clock count per iteration */


main()
{
       int i;

       asm("
       .equ ITER, 10           # number of iterations
       .equ OVERHEAD, 17       # 15 for PPlain, 17 for PMMX

       .data

       .align 32
var1:   .int 0,0
var2:   .int 0,0
dummy1: .int 0,0
dummy2: .int 0,0
var3:   .int 0,0
var4:   .int 0,0
dummy3: .int 0,0
dummy4: .int 0,0

       .text

       movl $0, counter        # reset loop counter

testloop:

#****************   Do any initializations here:    ************************
#
#       movl    var3, %%edx     # Ensure variable is in level 1 cache
#
#****************   End of initializations          ************************

       rdtsc                   # read clock counter
       movl    %%eax, tics     # save count
       cld                     # non-pairable filler
       .rept 8
       nop                     # eight NOP's to avoid shadowing effect
       .endr

#****************   Put instructions to test here:  ************************

       .rept 500
       movl    %%edx, var3
       movl    %%ebx, var3 + 4
       .endr

#********************* End of instructions to test  ************************

       clc                             # non-pairable filler with shadow
       rdtsc                           # read counter again
       subl    tics, %%eax             # compute difference
       subl    $OVERHEAD, %%eax        # subtract the clock cycles used by fillers etc
       movl    counter, %%edx          # loop counter
       movl    %%eax, resultlist(%%edx)# store result in table
       addl    $4, %%edx               # increment counter
       movl    %%edx, counter          # store counter

       cmpl    $(ITER * 4), %%edx
       jb      testloop                # repeat ITER times
       " : : : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" );


/* insert here code to read out the values in RESULTLIST */


       for(i = 0; i < ITER; i++)
               printf("%u\n", resultlist[i]);

       return 0;
}