Tickless kernelにより、既存の和文資料の示すところから離れだす。
仕方が無いのでソースコードを頼りに追いかけることにする。
主目的は最低限必要なタイマリソースはなにか、の確認。
scheduler、jiffiesの更新がなされれば、要件を満たすので、更新処理を遡る。
kernelはv3.10を参照した(LTSI候補だというのを見かけたはず?)
システム時刻:xtime だったはず
jiffies更新箇所
/*
* The 64-bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock.
* jiffies is defined in the linker script...
*/
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
update_wall_time();
calc_global_load(ticks);
}
コイツが大ボス. jiffiesそのものじゃないけどいいんか( system call "times"で返している)
jiffies更新は別か..
→一緒だった。物理メモリで同一アドレスにしてあるので、64の更新がjiffies更新と一致する(Little Endian)。
FILE: System.map
8xxxxxxx D jiffies
8xxxxxxx D jiffies_64
void xtime_update(unsigned long ticks) write_seqlock(&xtime_lock); do_timer(ticks); write_sequnlock(&xtime_lock);
FILE: kernel/time/tick-sched.c
/*
* Must be called with interrupts disabled !
*/
static void tick_do_update_jiffies64(ktime_t now)
{
unsigned long ticks = 0;
ktime_t delta;
/*
* Do a quick check without holding xtime_lock:
*/
delta = ktime_sub(now, last_jiffies_update);
if (delta.tv64 < tick_period.tv64)
return;
/* Reevalute with xtime_lock held */
write_seqlock(&xtime_lock);
delta = ktime_sub(now, last_jiffies_update);
if (delta.tv64 >= tick_period.tv64) {
delta = ktime_sub(delta, tick_period);
last_jiffies_update = ktime_add(last_jiffies_update,
tick_period);
/* Slow path for long timeouts */
if (unlikely(delta.tv64 >= tick_period.tv64)) {
s64 incr = ktime_to_ns(tick_period);
ticks = ktime_divns(delta, incr);
last_jiffies_update = ktime_add_ns(last_jiffies_update,
incr * ticks);
}
do_timer(++ticks);
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
}
write_sequnlock(&xtime_lock);
}
tick_do_update_jiffies64()を呼び出すヒト.
static void tick_nohz_update_jiffies(ktime_t now)
void tick_nohz_stop_sched_tick(int inidle)
tick_program_event()って?
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
void tick_nohz_restart_sched_tick(void)
static void tick_nohz_handler(struct clock_event_device *dev) // The nohz low res interrupt handler
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) // high-res. timer持っているとき.
いずれも、ktime_get();で現時刻をとってきている(CPU寝ていても回っているカウンタを想定)
FILE: kernel/time/timekeeping.c
ktime_t ktime_get(void)
{
unsigned int seq;
s64 secs, nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqbegin(&xtime_lock);
secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
nsecs += timekeeping_get_ns();
/* If arch requires, add in gettimeoffset() */
nsecs += arch_gettimeoffset();
} while (read_seqretry(&xtime_lock, seq));
/*
* Use ktime_set/ktime_add_ns to create a proper ktime on
* 32-bit architectures without CONFIG_KTIME_SCALAR.
*/
return ktime_add_ns(ktime_set(secs, 0), nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
ktime_get()は、xtimeを見ている。あれ..?
コチラを見ると、http://d.hatena.ne.jp/enakai00/20111117/1321508379
別の所でハードウェアリソースを参照していた。
FILE: kernel/time/timekeeping.c
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Called from the timer interrupt, must hold a write on xtime_lock.
*/
static void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
int shift = 0, maxshift;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
clock = timekeeper.clock; // ★★コレがSoC全域で使える、cpu idleでも時間を刻むクロックソースを握っている。
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = timekeeper.cycle_interval;
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
* we calculate the largest doubling multiple of cycle_intervals
* that is smaller then the offset. We then accumulate that
* chunk in one go, and then try to consume the next smaller
* doubled multiple.
*/
shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
shift = max(0, shift);
/* Bound shift to one less then what overflows tick_length */
maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
offset = logarithmic_accumulation(offset, shift);
if(offset < timekeeper.cycle_interval<<shift)
shift--;
}
/* correct the clock when NTP error is too big */
timekeeping_adjust(offset);
/*
* Since in the loop above, we accumulate any amount of time
* in xtime_nsec over a second into xtime.tv_sec, its possible for
* xtime_nsec to be fairly small after the loop. Further, if we're
* slightly speeding the clocksource up in timekeeping_adjust(),
* its possible the required corrective factor to xtime_nsec could
* cause it to underflow.
*
* Now, we cannot simply roll the accumulated second back, since
* the NTP subsystem has been notified via second_overflow. So
* instead we push xtime_nsec forward by the amount we underflowed,
* and add that amount into the error.
*
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
s64 neg = -(s64)timekeeper.xtime_nsec;
timekeeper.xtime_nsec = 0;
timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
}
/*
* Store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
*/
xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
timekeeper.ntp_error += timekeeper.xtime_nsec <<
timekeeper.ntp_error_shift;
/*
* Finally, make sure that after the rounding
* xtime.tv_nsec isn't larger then NSEC_PER_SEC
*/
if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
int leap;
xtime.tv_nsec -= NSEC_PER_SEC;
xtime.tv_sec++;
leap = second_overflow(xtime.tv_sec);
xtime.tv_sec += leap;
wall_to_monotonic.tv_sec -= leap;
if (leap)
clock_was_set_delayed();
}
timekeeping_update(false);
}
FILE: kernel/time/timekeeping.c
/**
* timekeeper_setup_internals - Set up internals to use clocksource clock.
*
* @clock: Pointer to clocksource.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
* pair and interval request.
*
* Unless you're the timekeeping code, you should not be using this!
*/
static void timekeeper_setup_internals(struct clocksource *clock)
{
cycle_t interval;
u64 tmp, ntpinterval;
timekeeper.clock = clock;
clock->cycle_last = clock->read(clock);
以下略
この関数でセットしている。張り替えもできる模様。
static int change_clocksource(void *data)
FILE: kernel/time/timekeeping.c
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
void __init timekeeping_init(void)
{
struct clocksource *clock;
unsigned long flags;
struct timespec now, boot;
read_persistent_clock(&now);
read_boot_clock(&boot);
write_seqlock_irqsave(&xtime_lock, flags);
ntp_init();
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
timekeeper_setup_internals(clock);
...
おうふ...
clocksource_default_clock()のデフォルトは jiffiesを返すだけの論理的なクロックソースだ。
kernel/time/jiffies.c
ドライバの初期化処理で、タイマリソースを追加して、張り替える処理が走っていると想像。
/**
* timekeeping_notify - Install a new clock source
* @clock: pointer to the clock source
*
* This function is called from clocksource.c after a new, better clock
* source has been registered. The caller holds the clocksource_mutex.
*/
void timekeeping_notify(struct clocksource *clock)
{
if (timekeeper.clock == clock)
return;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
}
これか. inline関数になってる。SMPとSTOP_MACHINEが定義されていなければ関数呼び出しに置換される。
FILE: include/linux/stop_machine.h
change_clocksource( clock );
FILE: kernel/time/clocksource.c
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
/**
* clocksource_select - Select the best clocksource available
*
* Private function. Must hold clocksource_mutex when called.
*
* Select the clocksource with the best rating, or the clocksource,
* which is selected by userspace override.
*/
static void clocksource_select(void)
コレ呼び出しているところが多数ある。。
clocksource_listの先頭がbest ratingになるらしい?
→追加するときに大きい物順になるようにコーディングしてある。
static void clocksource_enqueue(struct clocksource *cs)
なんとなく、登録情報で上限が定まるから、ここからタイマーを張っているような気がする。
タイマを貼ってしまったらTicklessじゃないような気がしないでもないよなぁ。
FILE: include/linux/clocksource.h
/*
* Don't call __clocksource_register_scale directly, use
* clocksource_register_hz/khz
*/
extern int
__clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq);
extern void
__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq);
static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
{
return __clocksource_register_scale(cs, 1, hz);
}
FILE: kernel/time/clocksource.c
/**
* __clocksource_register_scale - Used to install new clocksources
* @t: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
*
* Returns -EBUSY if registration fails, zero otherwise.
*
* This *SHOULD NOT* be called directly! Please use the
* clocksource_register_hz() or clocksource_register_khz helper functions.
*/
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq); // ★★★ここでcs->max_idle_nsに最長時間を設定してくれる。
/* Add clocksource to the clcoksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
この最長時間を返す関数が以下にある。(timerクロックリソースを保持するオブジェクト(ファイルスコープ))
FILE: kernel/time/timekeeping.c
u64 timekeeping_max_deferment(void)
これを使っていて、スケジューラ絡みのソースは以下。
FILE: kernel/time/tick-sched.c
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*
* When the next event is more than a tick into the future, stop the idle tick
* Called either from the idle loop or from irq_exit() when an idle period was
* just interrupted by an interrupt which did not cause a reschedule.
*/
void tick_nohz_stop_sched_tick(int inidle)
{
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
goto out;
} else if (!tick_program_event(expires, 0))
goto out;
tickを止める前に、タイマを貼ってますなぁ。ハイレゾが有効ならそれ。違えば普通の。
ハイレゾタイマが存在している場合。
FILE: kernel/hrtimer.c
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
* 0 on success
* 1 when the timer was active
*/
int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
unsigned long delta_ns, const enum hrtimer_mode mode)
{
return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
"nohz_mode = NOHZ_MODE_HIGHRES"を設定しているのは、以下。
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
void tick_setup_sched_timer(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now = ktime_get();
/*
* Emulate tick processing via per-CPU hrtimers:
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
now = ktime_get();
}
#ifdef CONFIG_NO_HZ
if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
}
#endif
}
#endif /* HIGH_RES_TIMERS */
普通の。
/**
* tick_program_event
*/
int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
return tick_dev_program_event(dev, expires, force);
}
タイマリソースの張替えが自由にできてしまうので、
AMPで同一カウンタを参照するような実装をしたければ、
抜けないようにしておくのが良い。stop requestでnack返せばいいかなぁ?(要確認)
ハイレゾタイマがあれば、それを使ってしまうぽい?
FILE: kernel/timer.c
/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __this_cpu_read(tvec_bases);
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
void __init init_timers(void)
{
int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
init_timer_stats();
BUG_ON(err != NOTIFY_OK);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
FILE: kernel/timer.c
/*
* Called from timer softirq every jiffy, expire hrtimers:
*
* For HRT its the fall back code to run the softirq in the timer
* softirq context in case the hrtimer initialization failed or has
* not been done yet.
*/
void hrtimer_run_pending(void)
{
if (hrtimer_hres_active()) //★★ ->hres_activeを返す。
return;
/*
* This _is_ ugly: We have to check in the softirq context,
* whether we can switch to highres and / or nohz mode. The
* clocksource switch happens in the timer interrupt with
* xtime_lock held. Notification from there only sets the
* check bit in the tick_oneshot code, otherwise we might
* deadlock vs. xtime_lock.
*/
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) // hrtimer_is_hres_enabled()は真になりやすい(hrtimer_hres_enabledを返す)
hrtimer_switch_to_hres();
}
hrtimer_hres_active()が
/*
* Switch to high resolution mode
*/
static int hrtimer_switch_to_hres(void)
{
int i, cpu = smp_processor_id();
struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
if (base->hres_active)
return 1;
local_irq_save(flags);
if (tick_init_highres()) {
local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
"mode on CPU %d\n", cpu);
return 0;
}
base->hres_active = 1;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
return 1;
}
clocksourceとclockeventとhrtimerと。
clockeventがhrtimerの代わりになれる?
clocksourceは、永続的に回るカウンタで、NO_HZに必要、か。
起き上がるのに別途タイマが必要になってくる。
(compare "以上"で割込みが出せるなら、一発でいける、か?)
hrtimerとしては登録しない
参考
http://www.spinics.net/lists/linux-rt-users/msg06427.html
clocksource
カレンダ時計みたいなもん。NO_HZで起き上がった時にデルタを見るために使う。
登録したratioの最大値のものを採用する(名前を指定して動的にもとれる)
デフォルトがjiffiesになっているので、昔ながらの定周期割込みでも運用できるぽい。
clockevent
hrtimerにも使われる(どこでそうしているか未確認)
属性をしっかりと設定して、PERIODIC,ONESHOTがアレば良さそう。
インタフェースと使われ方を照会して、インプリする。
ドキュメントはあまりない
(2013/03のコミットで、timerまわりの設計思想はthomasが書いているけど、ドライバの実装の話はナシ)
これ、ticklessだと?無効ですよ..
FILE: arm/kernel/time.c
#ifndef CONFIG_GENERIC_CLOCKEVENTS
/*
* Kernel system timer support.
*/
void timer_tick(void)
{
profile_tick(CPU_PROFILING);
do_leds();
xtime_update(1);
#ifndef CONFIG_SMP
update_process_times(user_mode(get_irq_regs()));
#endif
}
#endif
ARMのbootメモは、以下も詳しい。なぞる感じになるなぁ
http://amitshah.bizhat.com/arm/arm_linux_boot-1.html