Only in linux-2.5.41-bk2-kb/arch/i386/kernel: linux diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/include/linux/hrtime.h linux/include/linux/hrtime.h --- linux-2.5.41-bk2-kb/include/linux/hrtime.h Wed Dec 31 16:00:00 1969 +++ linux/include/linux/hrtime.h Wed Oct 9 14:07:45 2002 @@ -0,0 +1,156 @@ +#ifndef _HRTIME_H +#define _HRTIME_H + + + +/* + * This file is the glue to bring in the platform stuff. + * We make it all depend on the CONFIG option so all archs + * will work as long as the CONFIG is not set. Once an + * arch defines the CONFIG, it had better have the + * asm/hrtime.h file in place. + */ + +/* + * This gets filled in at init time, either static or dynamic. + * Someday this will be what NTP fiddles with. + * Do we need the scale here? I don't think so, as long as we + * do percentage offsets for NTP. + */ +struct timer_conversion_bits { + unsigned long _arch_to_usec; + unsigned long _arch_to_nsec; + unsigned long _usec_to_arch; + unsigned long _nsec_to_arch; + long _cycles_per_jiffies; + unsigned long _arch_to_latch; +}; +extern struct timer_conversion_bits timer_conversion_bits; +/* + * The following four values are not used for machines + * without a TSC. For machines with a TSC they + * are caculated at boot time. They are used to + * calculate "cycles" to jiffies or usec. Don't get + * confused into thinking they are simple multiples or + * divisors, however. + */ +#define arch_to_usec timer_conversion_bits._arch_to_usec +#define arch_to_nsec timer_conversion_bits._arch_to_nsec +#define usec_to_arch timer_conversion_bits._usec_to_arch +#define nsec_to_arch timer_conversion_bits._nsec_to_arch +#define cycles_per_jiffies timer_conversion_bits._cycles_per_jiffies +#define arch_to_latch timer_conversion_bits._arch_to_latch + +#include +#ifdef CONFIG_HIGH_RES_TIMERS +#include +/* + * The schedule_next_int function is to be defined by the "arch" code + * when an "arch" is implementing the high-res part of POSIX timers. + * The actual function will be called with the offset in "arch" (parm 2) + * defined sub_jiffie units from the reference jiffie boundry (parm 1)to + * the next required sub_jiffie timer interrupt. This value will be -1 + * if the next timer interrupt should be the next jiffie value. The + * "arch" code must determine how far out the interrupt is, based on + * current jiffie, sub_jiffie time and set up the hardware to interrupt + * at that time. It is possible that the time will already have passed, + * in which case the function should return true (no interrupt is + * needed), otherwise the return should be 0. The third parameter is the + * "always" flag which says that the code needs an interrupt, even if the + * time has passed. In this case a "close" in time should be used to + * generate the required interrupt. The sub_jiffie interrupt + * should just call do_timer(). If the interrupt code ususally does stuff + * each jiffie, a flag should be kept by the jiffies update code to + * indicate that a new jiffie has started. This flag is to keep this code + * from being executed on the sub jiffie interrupt. + */ +#ifndef schedule_next_int +#define schedule_next_int(s,d,a) 0 +#undef CONFIG_HIGH_RES_TIMERS +#endif // schedule_next_int +/* + * The sub_jiffie() macro should return the current time offset from the latest + * jiffie. This will be in "arch" defined units and is used to determine if + * a timer has expired. Since no sub_expire value will be used if "arch" + * has not defined the high-res package, 0 will work well here. + * + * In addition, to save time if there is no high-res package (or it is not + * configured), we define the sub expression for the run_timer_list. + */ + +#ifndef sub_jiffie +#undef CONFIG_HIGH_RES_TIMERS +#define sub_jiffie() 0 +#endif // sub_jiffie + +/* + * The high_res_test() macro should set up a test mode that will do a + * worst case timer interrupt. I.e. it may be that a call to + * schedule_next_int() could return -1 indicating that the time has + * already expired. This macro says to set it so that schedule_next_int() + * will always set up a timer interrupt. This is used during init to + * calculate the worst case loop time from timer set up to int to + * the signal code. + + * high_res_end_test() cancels the above state and allows the no + * interrupt return from schedule_next_int() + */ +#ifndef high_res_test +#define high_res_test() +#define high_res_end_test() +#endif + + +#define IF_HIGH_RES(a) a + +#else /* CONFIG_HIGH_RES_TIMERS */ +#define IF_HIGH_RES(a) +#define nsec_to_arch_cycles(a) 0 + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * Here is an SMP helping macro... + */ +#ifdef CONFIG_SMP +#define if_SMP(a) a +#else +#define if_SMP(a) +#endif +/* + * These should have been defined in the platform hrtimers.h + * If not (or HIGH_RES_TIMERS not configured) define the default. + */ +#ifndef update_jiffies +extern u64 jiffies_64; +#define update_jiffies() (*(u64 *)&jiffies_64)++ +#endif +#ifndef new_jiffie +#define new_jiffie() 0 +#endif +#ifndef schedule_next_int +#define schedule_next_int(a,b,c) +#endif +/* + * If we included a high-res file, we may have gotten a more efficient + * u64/u32, u64%u32 routine. The one in div64.h actually handles a + * u64 result, something we don't need, and, since it is more expensive + * arch porters are encouraged to implement the div_long_long_rem(). + * + * int div_long_long_rem(u64 dividend,int divisor,int* remainder) + * which returns dividend/divisor. + * + * Here we provide default code for those who, for what ever reason, + * have not provided the above. + */ +#ifndef div_long_long_rem +#include + +#define div_long_long_rem(dividend,divisor,remainder) ({ \ + u64 result = dividend; \ + *remainder = do_div(result,divisor); \ + result; }) + +#endif /* ifndef div_long_long_rem */ + +#endif /* _HRTIME_H */ diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/include/linux/signal.h linux/include/linux/signal.h --- linux-2.5.41-bk2-kb/include/linux/signal.h Mon Sep 9 10:35:04 2002 +++ linux/include/linux/signal.h Wed Oct 9 14:07:45 2002 @@ -219,6 +219,35 @@ } extern long do_sigpending(void *, unsigned long); +/* + * We would like the asm/signal.h code to define these so that the using + * function can call do_signal(). In loo of that, we define a genaric + * version that pretends that do_signal() was called and delivered a signal. + * To see how this is used, see nano_sleep() in timer.c and the i386 version + * in asm_i386/signal.h. + */ +#ifndef PT_REGS_ENTRY +#define PT_REGS_ENTRY(type,name,p1_type,p1, p2_type,p2) \ +type name(p1_type p1,p2_type p2)\ +{ +#endif +#ifndef _do_signal +#define _do_signal() 1 +#endif +#ifndef NANOSLEEP_ENTRY +#define NANOSLEEP_ENTRY(a) asmlinkage long sys_nanosleep( struct timespec* rqtp, \ + struct timespec * rmtp) \ +{ a +#endif +#ifndef CLOCK_NANOSLEEP_ENTRY +#define CLOCK_NANOSLEEP_ENTRY(a) asmlinkage long sys_clock_nanosleep( \ + clockid_t which_clock, \ + int flags, \ + const struct timespec *rqtp, \ + struct timespec *rmtp) \ +{ a + +#endif #ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER struct pt_regs; diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/include/linux/time.h linux/include/linux/time.h --- linux-2.5.41-bk2-kb/include/linux/time.h Wed Sep 18 17:04:09 2002 +++ linux/include/linux/time.h Wed Oct 9 14:07:45 2002 @@ -1,7 +1,7 @@ #ifndef _LINUX_TIME_H #define _LINUX_TIME_H -#include +#include #include #ifndef _STRUCT_TIMESPEC @@ -38,6 +38,19 @@ */ #define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) +/* Parameters used to convert the timespec values */ +#ifndef USEC_PER_SEC +#define USEC_PER_SEC (1000000L) +#endif + +#ifndef NSEC_PER_SEC +#define NSEC_PER_SEC (1000000000L) +#endif + +#ifndef NSEC_PER_USEC +#define NSEC_PER_USEC (1000L) +#endif + static __inline__ unsigned long timespec_to_jiffies(struct timespec *value) { @@ -46,16 +59,16 @@ if (sec >= (MAX_JIFFY_OFFSET / HZ)) return MAX_JIFFY_OFFSET; - nsec += 1000000000L / HZ - 1; - nsec /= 1000000000L / HZ; + nsec += NSEC_PER_SEC / HZ - 1; + nsec /= NSEC_PER_SEC / HZ; return HZ * sec + nsec; } static __inline__ void -jiffies_to_timespec(unsigned long jiffies, struct timespec *value) +jiffies_to_timespec(unsigned long _jiffies, struct timespec *value) { - value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ); - value->tv_sec = jiffies / HZ; + value->tv_nsec = (_jiffies % HZ) * (NSEC_PER_SEC / HZ); + value->tv_sec = _jiffies / HZ; } /* Same for "timeval" */ @@ -124,6 +137,7 @@ #ifdef __KERNEL__ extern void do_gettimeofday(struct timeval *tv); extern void do_settimeofday(struct timeval *tv); +extern int do_sys_settimeofday(struct timeval *tv, struct timezone *tz); #endif #define FD_SETSIZE __FD_SETSIZE @@ -140,9 +154,9 @@ #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 -struct itimerspec { - struct timespec it_interval; /* timer period */ - struct timespec it_value; /* timer expiration */ +struct itimerspec { + struct timespec it_interval; /* timer period */ + struct timespec it_value; /* timer expiration */ }; struct itimerval { diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/include/linux/timer.h linux/include/linux/timer.h --- linux-2.5.41-bk2-kb/include/linux/timer.h Mon Oct 7 15:24:31 2002 +++ linux/include/linux/timer.h Wed Oct 9 14:07:45 2002 @@ -14,6 +14,7 @@ unsigned long data; struct tvec_t_base_s *base; + long sub_expires; }; /*** @@ -46,6 +47,7 @@ extern void add_timer(struct timer_list * timer); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); +extern void update_real_wall_time(void); #if CONFIG_SMP extern int del_timer_sync(struct timer_list * timer); diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/init/Config.help linux/init/Config.help --- linux-2.5.41-bk2-kb/init/Config.help Mon Sep 9 10:35:01 2002 +++ linux/init/Config.help Wed Oct 9 14:07:45 2002 @@ -115,3 +115,13 @@ replacement for kerneld.) Say Y here and read about configuring it in . +Configure timer list size +CONFIG_TIMERLIST_512 + This choice allows you to choose the timer list size you want. The + list insert time is Order(N/size) where N is the number of active + timers. Each list head is 8 bytes, thus a 512 list size requires 4K + bytes. Use larger numbers if you will be using a large number of + timers and are more concerned about list insertion time than the extra + memory usage. (The list size must be a power of 2.) + + diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/init/Config.in linux/init/Config.in --- linux-2.5.41-bk2-kb/init/Config.in Mon Sep 9 10:35:03 2002 +++ linux/init/Config.in Wed Oct 9 14:07:45 2002 @@ -9,6 +9,27 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +choice 'Size of timer list?' \ + "512 CONFIG_TIMERLIST_512 \ + 1024 CONFIG_TIMERLIST_1k \ + 2048 CONFIG_TIMERLIST_2k \ + 4096 CONFIG_TIMERLIST_4k \ + 8192 CONFIG_TIMERLIST_8k" 512 +if [ "$CONFIG_TIMERLIST_512" = "y" ]; then + define_int CONFIG_NEW_TIMER_LISTSIZE 512 +fi +if [ "$CONFIG_TIMERLIST_1k" = "y" ]; then + define_int CONFIG_NEW_TIMER_LISTSIZE 1024 +fi +if [ "$CONFIG_TIMERLIST_2k" = "y" ]; then + define_int CONFIG_NEW_TIMER_LISTSIZE 2048 +fi +if [ "$CONFIG_TIMERLIST_4k" = "y" ]; then + define_int CONFIG_NEW_TIMER_LISTSIZE 4096 +fi +if [ "$CONFIG_TIMERLIST_8k" = "y" ]; then + define_int CONFIG_NEW_TIMER_LISTSIZE 8192 +fi endmenu mainmenu_option next_comment diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/kernel/ksyms.c linux/kernel/ksyms.c --- linux-2.5.41-bk2-kb/kernel/ksyms.c Wed Oct 9 13:56:12 2002 +++ linux/kernel/ksyms.c Wed Oct 9 14:07:45 2002 @@ -55,6 +55,7 @@ #include #include #include +#include #if defined(CONFIG_PROC_FS) #include @@ -496,6 +497,7 @@ #endif EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(jiffies_64); +EXPORT_SYMBOL(timer_conversion_bits); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); diff -urP -I \$Id:.*Exp \$ -X /usr/src/patch.exclude linux-2.5.41-bk2-kb/kernel/timer.c linux/kernel/timer.c --- linux-2.5.41-bk2-kb/kernel/timer.c Mon Oct 7 15:24:33 2002 +++ linux/kernel/timer.c Wed Oct 9 14:07:45 2002 @@ -17,6 +17,8 @@ * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + * 2002-10-01 High res timers code by George Anzinger + * Copyright (C)2002 by MontaVista Software. */ #include @@ -25,39 +27,46 @@ #include #include +#include +#include +#include #include +#ifndef IF_HIGH_RES +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * ifdef eliminator macro... + */ +#define IF_HIGH_RES(a) a +#else +#define IF_HIGH_RES(a) +#endif +#endif +#ifdef CONFIG_SMP +#undef IF_SMP +#define IF_SMP(a) a +#define SMP 1 +#else +#undef IF_SMP +#define IF_SMP(a) +#define SMP 0 +#endif +#ifndef CONFIG_NEW_TIMER_LISTSIZE +#define CONFIG_NEW_TIMER_LISTSIZE 512 +#endif +#define NEW_TVEC_SIZE CONFIG_NEW_TIMER_LISTSIZE +#define NEW_TVEC_MASK (NEW_TVEC_SIZE - 1) /* * per-CPU timer vector definitions: */ -#define TVN_BITS 6 -#define TVR_BITS 8 -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -typedef struct tvec_s { - int index; - struct list_head vec[TVN_SIZE]; -} tvec_t; - -typedef struct tvec_root_s { - int index; - struct list_head vec[TVR_SIZE]; -} tvec_root_t; -typedef struct timer_list timer_t; + struct tvec_t_base_s { spinlock_t lock; unsigned long timer_jiffies; - timer_t *running_timer; - tvec_root_t tv1; - tvec_t tv2; - tvec_t tv3; - tvec_t tv4; - tvec_t tv5; + volatile struct timer_list * volatile running_timer; + struct list_head tv[NEW_TVEC_SIZE]; } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; @@ -67,42 +76,101 @@ /* Fake initialization needed to avoid compiler breakage */ static DEFINE_PER_CPU(struct tasklet_struct, timer_tasklet) = { NULL }; -static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) -{ - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + base->tv1.index; - } else if (idx <= 0xffffffffUL) { - int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } else { - /* Can only get here on architectures with 64-bit jiffies */ - INIT_LIST_HEAD(&timer->entry); - return; - } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); + static inline void internal_add_timer(tvec_base_t *base, + struct timer_list *timer) + { + /* + * must be cli-ed when calling this + */ + unsigned long expires = timer->expires; + IF_HIGH_RES(int sub_expires = timer->sub_expires;) + int indx; + struct list_head *pos,*list_start; + + if ( time_before(expires, base->timer_jiffies) ){ + /* + * already expired, schedule for next tick + * would like to do better here + * Actually this now works just fine with the + * back up of timer_jiffies in "run_timer_list". + * Note that this puts the timer on a list other + * than the one it idexes to. We don't want to + * change the expires value in the timer as it is + * used by the repeat code in setitimer and the + * POSIX timers code. + */ + expires = base->timer_jiffies; + IF_HIGH_RES(sub_expires = 0); + } + + indx = expires & NEW_TVEC_MASK; + if ((expires - base->timer_jiffies) <= NEW_TVEC_SIZE) { + #ifdef CONFIG_HIGH_RES_TIMERS + unsigned long jiffies_f; + /* + * The high diff bits are the same, goes to the head of + * the list, sort on sub_expire. + */ + for (pos = (list_start = &base->tv[indx])->next; + pos != list_start; + pos = pos->next){ + struct timer_list *tmr = + list_entry(pos, + struct timer_list, + entry); + if ((tmr->sub_expires >= sub_expires) || + (tmr->expires != expires)){ + break; + } + } + list_add_tail(&timer->entry, pos); + /* + * Notes to me. Use jiffies here instead of + * timer_jiffies to prevent adding unneeded interrupts. + * Running_timer is NULL if we are NOT currently + * activly dispatching timers. Since we are under + * the same spin lock, it being false means that + * it has dropped the spinlock to call the timer + * function, which could well be who called us. + * In any case, we don't need a new interrupt as + * the timer dispach code (run_timer_list) will + * pick this up when the function it is calling + * returns. + */ + if ( expires == (jiffies_f = jiffies) && + list_start->next == &timer->entry && + (base->running_timer == NULL)) { + schedule_next_int(jiffies_f, sub_expires,1); + } + #else + pos = (&base->tv[indx])->next; + list_add_tail(&timer->entry, pos); + #endif + }else{ + /* + * The high diff bits differ, search from the tail + * The for will pick up an empty list. + */ + for (pos = (list_start = &base->tv[indx])->prev; + pos != list_start; + pos = pos->prev){ + struct timer_list *tmr = + list_entry(pos, + struct timer_list, + entry); + if (time_after(tmr->expires, expires)){ + continue; + } + IF_HIGH_RES( + if ((tmr->expires != expires) || + (tmr->sub_expires < sub_expires)) { + break; + } + ); + } + list_add(&timer->entry, pos); + } + } /*** @@ -119,7 +187,7 @@ * Timers with an ->expired field in the past will be executed in the next * timer tick. It's illegal to add an already pending timer. */ -void add_timer(timer_t *timer) +void add_timer(struct timer_list *timer) { int cpu = get_cpu(); tvec_base_t *base = tvec_bases + cpu; @@ -153,9 +221,11 @@ * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an * active timer returns 1.) */ -int mod_timer(timer_t *timer, unsigned long expires) +#ifndef CONFIG_HIGH_RES_TIMERS +int mod_timer(struct timer_list *timer, unsigned long expires) { - tvec_base_t *old_base, *new_base; + tvec_base_t *new_base; + IF_SMP( tvec_base_t *old_base;) unsigned long flags; int ret = 0; @@ -170,6 +240,7 @@ local_irq_save(flags); new_base = tvec_bases + smp_processor_id(); +#ifdef CONFIG_SMP repeat: old_base = timer->base; @@ -185,6 +256,74 @@ spin_lock(&new_base->lock); } /* + * Subtle, we rely on timer->base being always + * valid and being updated atomically. + */ + if (timer->base != old_base) { + spin_unlock(&new_base->lock); + spin_unlock(&old_base->lock); + goto repeat; + } + } else +#endif + spin_lock(&new_base->lock); + + timer->expires = expires; + /* + * Delete the previous timeout (if there was any), and install + * the new one: + */ + if (timer->base) { + list_del(&timer->entry); + ret = 1; + } + internal_add_timer(new_base, timer); + timer->base = new_base; + + IF_SMP(if (old_base && (new_base != old_base)) + spin_unlock(&old_base->lock); + ) + spin_unlock_irqrestore(&new_base->lock, flags); + + return ret; +} +#else +/* + * Is this useful? It is not used by POSIX timers. + */ +int mod_timer_hr(struct timer_list *timer, + unsigned long expires, + long sub_expires) +{ + tvec_base_t *new_base; + IF_SMP( tvec_base_t *old_base;) + unsigned long flags; + int ret = 0; + + if (timer_pending(timer) && + timer->expires == expires && + timer->sub_expires == sub_expires) + return 1; + + local_irq_save(flags); + new_base = tvec_bases + smp_processor_id(); +#ifdef CONFIG_SMP + + repeat: + old_base = timer->base; + + /* + * Prevent deadlocks via ordering by old_base < new_base. + */ + if (old_base && (new_base != old_base)) { + if (old_base < new_base) { + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + } else { + spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + } + /* * The timer base might have changed while we were * trying to take the lock(s): */ @@ -194,26 +333,35 @@ goto repeat; } } else +#endif + spin_lock(&new_base->lock); /* * Delete the previous timeout (if there was any), and install * the new one: */ - if (old_base) { + if (timer->base) { list_del(&timer->entry); ret = 1; } timer->expires = expires; + timer->sub_expires = sub_expires; internal_add_timer(new_base, timer); timer->base = new_base; - if (old_base && (new_base != old_base)) - spin_unlock(&old_base->lock); + IF_SMP(if (old_base && (new_base != old_base)) + spin_unlock(&old_base->lock); + ) spin_unlock_irqrestore(&new_base->lock, flags); return ret; } +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + return mod_timer_hr(timer, expires, timer->sub_expires); +} +#endif /*** * del_timer - deactive a timer. @@ -226,7 +374,7 @@ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an * active timer returns 1.) */ -int del_timer(timer_t *timer) +int del_timer(struct timer_list *timer) { unsigned long flags; tvec_base_t *base; @@ -263,7 +411,8 @@ * * The function returns whether it has deactivated a pending timer or not. */ -int del_timer_sync(timer_t *timer) + +int del_timer_sync(struct timer_list * timer) { tvec_base_t *base = tvec_bases; int i, ret; @@ -287,94 +436,113 @@ } #endif +/* + * run_timer_list is ALWAYS called from softirq which calls with irq enabled. + * We may assume this and not save the flags. + */ -static void cascade(tvec_base_t *base, tvec_t *tv) -{ - /* cascade all the timers from tv up one level */ - struct list_head *head, *curr, *next; - - head = tv->vec + tv->index; - curr = head->next; - /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. - */ - while (curr != head) { - timer_t *tmp; - - tmp = list_entry(curr, timer_t, entry); - if (tmp->base != base) - BUG(); - next = curr->next; - internal_add_timer(base, tmp); - curr = next; - } - INIT_LIST_HEAD(head); - tv->index = (tv->index + 1) & TVN_MASK; -} -/*** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(tvec_base_t *base) +static void __run_timers(tvec_base_t *base) { - unsigned long flags; - - spin_lock_irqsave(&base->lock, flags); + IF_HIGH_RES( unsigned long jiffies_f; + long sub_jiff = -1; + long sub_jiffie_f); + spin_lock_irq(&base->lock); +#ifdef CONFIG_HIGH_RES_TIMERS + read_lock(&xtime_lock); + jiffies_f = jiffies; + sub_jiffie_f = sub_jiffie() + quick_get_cpuctr(); + read_unlock(&xtime_lock); + while ( unlikely(sub_jiffie_f >= cycles_per_jiffies)){ + sub_jiffie_f -= cycles_per_jiffies; + jiffies_f++; + } + while ((long)(jiffies_f - base->timer_jiffies) >= 0) { +#else while ((long)(jiffies - base->timer_jiffies) >= 0) { - struct list_head *head, *curr; +#endif + struct list_head *head, *curr; + head = base->tv + + (base->timer_jiffies & NEW_TVEC_MASK); /* - * Cascade timers: + * Note that we never move "head" but continue to + * pick the first entry from it. This allows new + * entries to be inserted while we unlock for the + * function call below. */ - if (!base->tv1.index) { - cascade(base, &base->tv2); - if (base->tv2.index == 1) { - cascade(base, &base->tv3); - if (base->tv3.index == 1) { - cascade(base, &base->tv4); - if (base->tv4.index == 1) - cascade(base, &base->tv5); - } - } - } repeat: - head = base->tv1.vec + base->tv1.index; curr = head->next; if (curr != head) { void (*fn)(unsigned long); unsigned long data; - timer_t *timer; + struct timer_list *timer; - timer = list_entry(curr, timer_t, entry); - fn = timer->function; - data = timer->data; + timer = list_entry(curr, struct timer_list, entry); +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * This would be simpler if we never got behind + * i.e. if timer_jiffies == jiffies, we could + * drop one of the tests. Since we may get + * behind, (in fact we don't up date until + * we are behind to allow sub_jiffie entries) + * we need a way to negate the sub_jiffie + * test in that case... + */ + if (time_before(timer->expires, jiffies_f)|| + ((timer->expires == jiffies_f) && + timer->sub_expires <= sub_jiffie_f)) +#else + if (time_before_eq(timer->expires, jiffies)) +#endif + {fn = timer->function; + data= timer->data; - list_del(&timer->entry); - timer->base = NULL; -#if CONFIG_SMP - base->running_timer = timer; + list_del(&timer->entry); + timer->base = NULL; + timer->entry.next = timer->entry.prev = NULL; + base->running_timer = timer; + spin_unlock_irq(&base->lock); + fn(data); + spin_lock_irq(&base->lock); + goto repeat; + } +#ifdef CONFIG_HIGH_RES_TIMERS + else{ + /* + * The new timer list is not always emptied + * here as it contains: + * a.) entries (list size)^N*jiffies out and + * b.) entries that match in jiffies, but have + * sub_expire times further out than now. + */ + if (timer->expires == jiffies_f ){ + sub_jiff = timer->sub_expires; + } + } #endif - spin_unlock_irq(&base->lock); - fn(data); - spin_lock_irq(&base->lock); - goto repeat; } ++base->timer_jiffies; - base->tv1.index = (base->tv1.index + 1) & TVR_MASK; } -#if CONFIG_SMP + /* + * It is faster to back out the last bump, than to prevent it. + * This allows zero time inserts as well as sub_jiffie values in + * the current jiffie. Did not work for the cascade as tv1.index + * also needs adjusting. + */ + --base->timer_jiffies; base->running_timer = NULL; -#endif - spin_unlock_irqrestore(&base->lock, flags); -} -/******************************************************************/ + IF_HIGH_RES(if (schedule_next_int( jiffies_f, sub_jiff, 0)){ + /* + * If schedule_next_int says the time has passed + * bump the tasklet lock so we go round again + */ + run_local_timers(); + }); + spin_unlock_irq(&base->lock); +} /* * Timekeeping variables */ @@ -654,15 +822,37 @@ /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. + * + * Here is where we need to sort out the sub-jiffie interrupts from the + * jiffie ones and make sure we only do accounting once per jiffie per cpu. + * We do this by using new_jiffie as a bit per cpu. All ops are atomic. */ +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; + void update_process_times(int user_tick) { struct task_struct *p = current; int cpu = smp_processor_id(), system = user_tick ^ 1; - update_one_process(p, user_tick, system, cpu); + /* + * always run the timer list to pick up sub-jiffie timers + */ run_local_timers(); - scheduler_tick(user_tick, system); + + /* + * If high-res, we come here more often that 1/HZ. Don't pass + * the extra calls to those who only want the 1/HZ call. + */ +#ifdef CONFIG_HIGH_RES_TIMERS + if (test_and_clear_bit(cpu, (volatile unsigned long *)&new_jiffie())) +#endif + { update_one_process(p, user_tick, system, cpu); + scheduler_tick(user_tick, system); + } } /* @@ -681,35 +871,38 @@ * * Requires xtime_lock to access. */ -unsigned long avenrun[3]; /* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. + * calc_load - (runs on above timer), update the avenrun load estimates. + * This is called from soft_irq context, ints on, bh locked. */ -static inline void calc_load(unsigned long ticks) +unsigned long avenrun[3]; +static inline void calc_load(void); + +struct timer_list calc_load_timer = { + expires: LOAD_FREQ, + function:(void (*)(unsigned long))calc_load, + entry: {0,0} }; + +static inline void calc_load(void) { unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - count -= ticks; - if (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - } + active_tasks = count_active_tasks(); + write_lock_irq(&xtime_lock); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + write_unlock_irq(&xtime_lock); + + calc_load_timer.expires = jiffies + LOAD_FREQ; + add_timer(&calc_load_timer); } + /* jiffies at the most recent update of wall time */ unsigned long wall_jiffies; -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; unsigned long last_time_offset; /* @@ -719,8 +912,7 @@ { tvec_base_t *base = tvec_bases + smp_processor_id(); - if ((long)(jiffies - base->timer_jiffies) >= 0) - __run_timers(base); + __run_timers(base); } /* @@ -745,8 +937,25 @@ update_wall_time(ticks); } last_time_offset = 0; - calc_load(ticks); } +#ifdef CONFIG_HIGH_RES_TIMERS +void update_real_wall_time(void) +{ + unsigned long ticks; + /* + * To get the time of day really right, we need to make sure + * every one is on the same jiffie. (Because of adj_time, etc.) + * So we provide this for the high res code. Must be called + * under the write(xtime_lock). (External locking allows the + * caller to include sub jiffies in the lock region.) + */ + ticks = jiffies - wall_jiffies; + if (ticks) { + wall_jiffies += ticks; + update_wall_time(ticks); + } +} +#endif /* * The 64-bit jiffies value is not atomic - you MUST NOT read it @@ -756,12 +965,12 @@ void do_timer(struct pt_regs *regs) { - jiffies_64++; -#ifndef CONFIG_SMP - /* SMP process accounting uses the local APIC timer */ - - update_process_times(user_mode(regs)); -#endif + update_jiffies(); + /* + * SMP process accounting uses the local APIC timer + */ + if (!SMP ) + update_process_times(user_mode(regs)); update_times(); } @@ -770,7 +979,7 @@ extern int do_setitimer(int, struct itimerval *, struct itimerval *); /* - * For backwards compatibility? This can be done in libc so Alpha + * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ asmlinkage unsigned long sys_alarm(unsigned int seconds) @@ -872,7 +1081,7 @@ asmlinkage long sys_getegid(void) { /* Only we change this so SMP safe */ - return current->egid; + return current->egid; } #endif @@ -910,7 +1119,7 @@ */ signed long schedule_timeout(signed long timeout) { - timer_t timer; + struct timer_list timer; unsigned long expire; switch (timeout) @@ -966,10 +1175,31 @@ return current->pid; } -asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) +#if 0 +// This #if 0 is to keep the pretty printer/ formatter happy so the indents will +// correct below. +// The NANOSLEEP_ENTRY macro is defined in asm/signal.h and +// is structured to allow code as well as entry definitions, so that when +// we get control back here the entry parameters will be available as expected. +// Some systems may find these paramerts in other ways than as entry parms, +// for example, struct pt_regs *regs is defined in i386 as the address of the +// first parameter, where as other archs pass it as one of the paramerters. +asmlinkage long sys_nanosleep(void) { - struct timespec t; - unsigned long expire; +#endif + NANOSLEEP_ENTRY( struct timespec t; + unsigned long expire;) + +#ifndef FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP + // The following code expects rqtp, rmtp to be available as a result of + // the above macro. Also any regs needed for the _do_signal() macro + // shoule be set up here. + + //asmlinkage long sys_nanosleep(struct timespec *rqtp, + // struct timespec *rmtp) + // { + // struct timespec t; + // unsigned long expire; if(copy_from_user(&t, rqtp, sizeof(struct timespec))) return -EFAULT; @@ -979,8 +1209,9 @@ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + do { + current->state = TASK_INTERRUPTIBLE; + } while((expire = schedule_timeout(expire)) && !_do_signal()); if (expire) { if (rmtp) { @@ -992,7 +1223,7 @@ } return 0; } - +#endif // ! FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP /* * sys_sysinfo - fill in sysinfo struct */ @@ -1073,14 +1304,9 @@ base = tvec_bases + i; spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); + for (j = 0; j < NEW_TVEC_SIZE; j++) + INIT_LIST_HEAD(base->tv + j); tasklet_init(&per_cpu(timer_tasklet, i), run_timer_tasklet, 0); } + calc_load(); }