From a849a0273d0f73a252d14d31c5003a8059ea51fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:37 +0200 Subject: [PATCH 001/140] ntp: Remove unused tick_nsec tick_nsec is only updated in the NTP core, but there are no users. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-1-2d52f4e13476@linutronix.de --- arch/x86/include/asm/timer.h | 2 -- include/linux/timex.h | 1 - kernel/time/ntp.c | 8 ++------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 7365dd4acffb65..23baf8c9b34ca7 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -6,8 +6,6 @@ #include #include -#define TICK_SIZE (tick_nsec / 1000) - unsigned long long native_sched_clock(void); extern void recalibrate_cpu_khz(void); diff --git a/include/linux/timex.h b/include/linux/timex.h index 3871b06bd302ce..7f7a12fd8200ce 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -145,7 +145,6 @@ unsigned long random_get_entropy_fallback(void); * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ -extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 802b336f4b8c2f..c17cc9d857bc11 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -33,9 +33,6 @@ /* USER_HZ period (usecs): */ unsigned long tick_usec = USER_TICK_USEC; -/* SHIFTED_HZ period (nsecs): */ -unsigned long tick_nsec; - static u64 tick_length; static u64 tick_length_base; @@ -253,8 +250,8 @@ static inline int ntp_synced(void) */ /* - * Update (tick_length, tick_length_base, tick_nsec), based - * on (tick_usec, ntp_tick_adj, time_freq): + * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and + * time_freq: */ static void ntp_update_frequency(void) { @@ -267,7 +264,6 @@ static void ntp_update_frequency(void) second_length += ntp_tick_adj; second_length += time_freq; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* From 66606a93849bfe3cbe9f0b801b40f60b87c54e11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:38 +0200 Subject: [PATCH 002/140] ntp: Make tick_usec static There are no users of tick_usec outside of the NTP core code. Therefore make tick_usec static. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-2-2d52f4e13476@linutronix.de --- include/linux/timex.h | 7 ------- kernel/time/ntp.c | 5 ++++- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index 7f7a12fd8200ce..4ee32eff3f2214 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -139,13 +139,6 @@ unsigned long random_get_entropy_fallback(void); #define MAXSEC 2048 /* max interval between updates (s) */ #define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ -/* - * kernel variables - * Note: maximum error = NTP sync distance = dispersion + delay / 2; - * estimated error = NTP dispersion. 
- */ -extern unsigned long tick_usec; /* USER_HZ period (usec) */ - /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c17cc9d857bc11..ed15ec993a829f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = USER_TICK_USEC; +static unsigned long tick_usec = USER_TICK_USEC; static u64 tick_length; static u64 tick_length_base; @@ -44,6 +44,9 @@ static u64 tick_length_base; /* * phase-lock loop variables + * + * Note: maximum error = NTP sync distance = dispersion + delay / 2; + * estimated error = NTP dispersion. */ /* From a0581cdb2e5d3ad633e51a945b6f0527ce70b68a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:39 +0200 Subject: [PATCH 003/140] ntp: Clean up comments Usage of different comment formatting makes fast reading and parsing the code harder. There are several multi-line comments which do not follow the coding style by starting with a line only containing '/*'. There are also comments which do not start with capitals. Clean up all those comments to be consistent and remove comments which document the obvious. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-3-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 144 +++++++++++++++++++++++++--------------------- 1 file changed, 78 insertions(+), 66 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ed15ec993a829f..e78d3cd1ec321b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -119,7 +119,8 @@ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ -/* PPS kernel consumer compensates the whole phase error immediately. +/* + * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. */ static inline s64 ntp_offset_chunk(s64 offset) @@ -132,8 +133,7 @@ static inline s64 ntp_offset_chunk(s64 offset) static inline void pps_reset_freq_interval(void) { - /* the PPS calibration interval may end - surprisingly early */ + /* The PPS calibration interval may end surprisingly early */ pps_shift = PPS_INTMIN; pps_intcnt = 0; } @@ -151,9 +151,9 @@ static inline void pps_clear(void) pps_freq = 0; } -/* Decrease pps_valid to indicate that another second has passed since - * the last PPS signal. When it reaches 0, indicate that PPS signal is - * missing. +/* + * Decrease pps_valid to indicate that another second has passed since the + * last PPS signal. When it reaches 0, indicate that PPS signal is missing. 
*/ static inline void pps_dec_valid(void) { @@ -174,17 +174,21 @@ static inline void pps_set_freq(s64 freq) static inline int is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) - /* PPS signal lost when either PPS time or - * PPS frequency synchronization requested + /* + * PPS signal lost when either PPS time or PPS frequency + * synchronization requested */ || ((status & (STA_PPSFREQ|STA_PPSTIME)) && !(status & STA_PPSSIGNAL)) - /* PPS jitter exceeded when - * PPS time synchronization requested */ + /* + * PPS jitter exceeded when PPS time synchronization + * requested + */ || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) - /* PPS wander exceeded or calibration error when - * PPS frequency synchronization requested + /* + * PPS wander exceeded or calibration error when PPS + * frequency synchronization requested */ || ((status & STA_PPSFREQ) && (status & (STA_PPSWANDER|STA_PPSERROR))); @@ -270,8 +274,8 @@ static void ntp_update_frequency(void) new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* - * Don't wait for the next second_overflow, apply - * the change to the tick length immediately: + * Don't wait for the next second_overflow, apply the change to the + * tick length immediately: */ tick_length += new_base - tick_length_base; tick_length_base = new_base; @@ -307,10 +311,7 @@ static void ntp_update_offset(long offset) offset *= NSEC_PER_USEC; } - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ + /* Scale the phase adjustment and clamp to the operating range. */ offset = clamp(offset, -MAXPHASE, MAXPHASE); /* @@ -349,7 +350,8 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - time_adjust = 0; /* stop active adjtime() */ + /* Stop active adjtime() */ + time_adjust = 0; time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; @@ -387,7 +389,7 @@ ktime_t ntp_get_next_leap(void) } /* - * this routine handles the overflow of the microsecond field + * This routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. @@ -452,7 +454,6 @@ int second_overflow(time64_t secs) break; } - /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; if (time_maxerror > NTP_PHASE_LIMIT) { @@ -696,7 +697,7 @@ static inline void process_adj_status(const struct __kernel_timex *txc) time_state = TIME_OK; time_status = STA_UNSYNC; ntp_next_leap_sec = TIME64_MAX; - /* restart PPS frequency calibration */ + /* Restart PPS frequency calibration */ pps_reset_freq_interval(); } @@ -707,7 +708,7 @@ static inline void process_adj_status(const struct __kernel_timex *txc) if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = __ktime_get_real_seconds(); - /* only set allowed bits */ + /* Only set allowed bits */ time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; } @@ -729,7 +730,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); - /* update pps_freq */ + /* Update pps_freq */ pps_set_freq(time_freq); } @@ -762,7 +763,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, /* - * adjtimex mainly allows reading (and writing, if superuser) of + * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. 
used by xntpd. */ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, @@ -806,8 +807,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->offset = (u32)txc->offset / NSEC_PER_USEC; } - result = time_state; /* mostly `TIME_OK' */ - /* check for errors */ + result = time_state; if (is_error_status(time_status)) result = TIME_ERROR; @@ -822,7 +822,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->tick = tick_usec; txc->tai = *time_tai; - /* fill PPS status fields */ + /* Fill PPS status fields */ pps_fill_timex(txc); txc->time.tv_sec = ts->tv_sec; @@ -853,17 +853,21 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, #ifdef CONFIG_NTP_PPS -/* actually struct pps_normtime is good old struct timespec, but it is +/* + * struct pps_normtime is basically a struct timespec, but it is * semantically different (and it is the reason why it was invented): * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] - * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) + */ struct pps_normtime { s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; -/* normalize the timestamp so that nsec is in the - ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +/* + * Normalize the timestamp so that nsec is in the + * [ -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval + */ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { @@ -879,7 +883,7 @@ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) return norm; } -/* get current phase correction and jitter */ +/* Get current phase correction and jitter */ static inline long pps_phase_filter_get(long *jitter) { *jitter = pps_tf[0] - pps_tf[1]; @@ -890,7 +894,7 @@ static inline long pps_phase_filter_get(long *jitter) return pps_tf[0]; } -/* add the sample to the phase filter */ +/* Add the sample to the phase filter */ static inline void pps_phase_filter_add(long err) { pps_tf[2] = pps_tf[1]; @@ -898,8 +902,9 @@ static inline void pps_phase_filter_add(long err) pps_tf[0] = err; } -/* decrease frequency calibration interval length. - * It is halved after four consecutive unstable intervals. +/* + * Decrease frequency calibration interval length. It is halved after four + * consecutive unstable intervals. */ static inline void pps_dec_freq_interval(void) { @@ -912,8 +917,9 @@ static inline void pps_dec_freq_interval(void) } } -/* increase frequency calibration interval length. - * It is doubled after four consecutive stable intervals. +/* + * Increase frequency calibration interval length. It is doubled after + * four consecutive stable intervals. 
*/ static inline void pps_inc_freq_interval(void) { @@ -926,7 +932,8 @@ static inline void pps_inc_freq_interval(void) } } -/* update clock frequency based on MONOTONIC_RAW clock PPS signal +/* + * Update clock frequency based on MONOTONIC_RAW clock PPS signal * timestamps * * At the end of the calibration interval the difference between the @@ -940,7 +947,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) long delta, delta_mod; s64 ftemp; - /* check if the frequency interval was too long */ + /* Check if the frequency interval was too long */ if (freq_norm.sec > (2 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; @@ -951,9 +958,10 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) return 0; } - /* here the raw frequency offset and wander (stability) is - * calculated. If the wander is less than the wander threshold - * the interval is increased; otherwise it is decreased. + /* + * Here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold the + * interval is increased; otherwise it is decreased. */ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); @@ -965,13 +973,14 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); - } else { /* good sample */ + } else { + /* Good sample */ pps_inc_freq_interval(); } - /* the stability metric is calculated as the average of recent - * frequency changes, but is used only for performance - * monitoring + /* + * The stability metric is calculated as the average of recent + * frequency changes, but is used only for performance monitoring */ delta_mod = delta; if (delta_mod < 0) @@ -980,7 +989,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) (NTP_SCALE_SHIFT - SHIFT_USEC), NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; - /* if enabled, the system clock frequency is updated */ + /* If enabled, the system clock frequency is updated */ if ((time_status & STA_PPSFREQ) != 0 && (time_status & STA_FREQHOLD) == 0) { time_freq = pps_freq; @@ -990,17 +999,18 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) return delta; } -/* correct REALTIME clock phase error against PPS signal */ +/* Correct REALTIME clock phase error against PPS signal */ static void hardpps_update_phase(long error) { long correction = -error; long jitter; - /* add the sample to the median filter */ + /* Add the sample to the median filter */ pps_phase_filter_add(correction); correction = pps_phase_filter_get(&jitter); - /* Nominal jitter is due to PPS signal noise. If it exceeds the + /* + * Nominal jitter is due to PPS signal noise. If it exceeds the * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. 
 */
 	if (jitter > (pps_jitter << PPS_POPCORN)) {
@@ -1011,13 +1021,13 @@ static void hardpps_update_phase(long error)
 		time_status |= STA_PPSJITTER;
 		pps_jitcnt++;
 	} else if (time_status & STA_PPSTIME) {
-		/* correct the time using the phase offset */
+		/* Correct the time using the phase offset */
 		time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
 				NTP_INTERVAL_FREQ);
-		/* cancel running adjtime() */
+		/* Cancel running adjtime() */
 		time_adjust = 0;
 	}
-	/* update jitter */
+	/* Update jitter */
 	pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
 }
 
@@ -1039,41 +1049,43 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t
 
 	pts_norm = pps_normalize_ts(*phase_ts);
 
-	/* clear the error bits, they will be set again if needed */
+	/* Clear the error bits, they will be set again if needed */
 	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
 
-	/* indicate signal presence */
+	/* Indicate signal presence */
 	time_status |= STA_PPSSIGNAL;
 	pps_valid = PPS_VALID;
 
-	/* when called for the first time,
-	 * just start the frequency interval */
+	/*
+	 * When called for the first time, just start the frequency
+	 * interval
+	 */
 	if (unlikely(pps_fbase.tv_sec == 0)) {
 		pps_fbase = *raw_ts;
 		return;
 	}
 
-	/* ok, now we have a base for frequency calculation */
+	/* Ok, now we have a base for frequency calculation */
 	freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
 
-	/* check that the signal is in the range
-	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+	/*
+	 * Check that the signal is in the range
+	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it
+	 */
 	if ((freq_norm.sec == 0) ||
 	    (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
 	    (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
 		time_status |= STA_PPSJITTER;
-		/* restart the frequency calibration interval */
+		/* Restart the frequency calibration interval */
 		pps_fbase = *raw_ts;
 		printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
 		return;
 	}
 
-	/* signal is ok */
-
-	/* check if the current frequency interval is finished */
+	/* Signal is ok. Check if the current frequency interval is finished */
 	if (freq_norm.sec >= (1 << pps_shift)) {
 		pps_calcnt++;
-		/* restart the frequency calibration interval */
+		/* Restart the frequency calibration interval */
 		pps_fbase = *raw_ts;
 		hardpps_update_freq(freq_norm);
 	}

From 38007dc032bd90920463c5d2e6a27d89f7617d6d Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Wed, 11 Sep 2024 15:17:40 +0200
Subject: [PATCH 004/140] ntp: Cleanup formatting of code

Code is partially formatted in a creative way which makes reading
harder. Examples are function calls over several lines where the
indentation does not start at the same height as the opening bracket
after the function name.

Improve formatting but do not make a functional change.
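
Taken from the diff below, the printk_deferred() call in
hardpps_update_freq() is one such case; the change turns

  printk_deferred(KERN_ERR
          "hardpps: PPSERROR: interval too long - %lld s\n",
          freq_norm.sec);

into

  printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n",
                  freq_norm.sec);

so that the continuation line starts right under the opening bracket.
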
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-4-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e78d3cd1ec321b..bf2f6ee23a2ece 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -428,8 +428,7 @@ int second_overflow(time64_t secs) } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); + pr_notice("Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: @@ -440,8 +439,7 @@ int second_overflow(time64_t secs) leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + pr_notice("Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: @@ -842,10 +840,8 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->tai--; txc->time.tv_sec++; } - if ((time_state == TIME_OOP) && - (ts->tv_sec == ntp_next_leap_sec)) { + if ((time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) result = TIME_WAIT; - } } return result; @@ -952,9 +948,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %lld s\n", - freq_norm.sec); + printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", + freq_norm.sec); return 0; } @@ -968,8 +963,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - printk_deferred(KERN_WARNING - "hardpps: PPSWANDER: change=%ld\n", delta); + printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -985,13 +979,11 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << - (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; /* If enabled, the system clock frequency is updated */ - if ((time_status & STA_PPSFREQ) != 0 && - (time_status & STA_FREQHOLD) == 0) { + if ((time_status & STA_PPSFREQ) && !(time_status & STA_FREQHOLD)) { time_freq = pps_freq; ntp_update_frequency(); } @@ -1015,15 +1007,13 @@ static void hardpps_update_phase(long error) * the time offset is updated. 
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - printk_deferred(KERN_WARNING - "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { /* Correct the time using the phase offset */ - time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, - NTP_INTERVAL_FREQ); + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); /* Cancel running adjtime() */ time_adjust = 0; } @@ -1072,9 +1062,8 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t * Check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ - if ((freq_norm.sec == 0) || - (freq_norm.nsec > MAXFREQ * freq_norm.sec) || - (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { time_status |= STA_PPSJITTER; /* Restart the frequency calibration interval */ pps_fbase = *raw_ts; From 48c3c65f64b01164f1704b40b38f60837d484f13 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:41 +0200 Subject: [PATCH 005/140] ntp: Convert functions with only two states to bool is_error_status() and ntp_synced() return whether a state is set or not. Both functions use unsigned int for it even if it would be a perfect job for a bool. Use bool instead of unsigned int. And while at it, move ntp_synced() function to the place where it is used. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-5-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bf2f6ee23a2ece..905b0216b12dc4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -171,7 +171,7 @@ static inline void pps_set_freq(s64 freq) pps_freq = freq; } -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) /* @@ -221,7 +221,7 @@ static inline void pps_clear(void) {} static inline void pps_dec_valid(void) {} static inline void pps_set_freq(s64 freq) {} -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return status & (STA_UNSYNC|STA_CLOCKERR); } @@ -241,21 +241,6 @@ static inline void pps_fill_timex(struct __kernel_timex *txc) #endif /* CONFIG_NTP_PPS */ - -/** - * ntp_synced - Returns 1 if the NTP status is not UNSYNC - * - */ -static inline int ntp_synced(void) -{ - return !(time_status & STA_UNSYNC); -} - - -/* - * NTP methods: - */ - /* * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and * time_freq: @@ -609,6 +594,15 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns } #endif +/** + * ntp_synced - Tells whether the NTP status is not UNSYNC + * Returns: true if not UNSYNC, false otherwise + */ +static inline bool ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + /* * If we have an externally synchronized Linux clock, then update RTC clock * accordingly every ~11 minutes. 
Generally RTCs can only store second

From 136bccbc2e78d3cd0bd8831e4c5a4509c0ddd945 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 11 Sep 2024 15:17:42 +0200
Subject: [PATCH 006/140] ntp: Read reference time only once

The reference time is required twice in ntp_update_offset(). It will not
change in the meantime as the calling code holds the timekeeper lock. Read
it only once and store it into a local variable.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-6-2d52f4e13476@linutronix.de
---
 kernel/time/ntp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 905b0216b12dc4..0bfd07de202a56 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -283,9 +283,8 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
 
 static void ntp_update_offset(long offset)
 {
-	s64 freq_adj;
-	s64 offset64;
-	long secs;
+	s64 freq_adj, offset64;
+	long secs, real_secs;
 
 	if (!(time_status & STA_PLL))
 		return;
@@ -303,11 +302,12 @@ static void ntp_update_offset(long offset)
 	 * Select how the frequency is to be controlled
 	 * and in which mode (PLL or FLL).
 	 */
-	secs = (long)(__ktime_get_real_seconds() - time_reftime);
+	real_secs = __ktime_get_real_seconds();
+	secs = (long)(real_secs - time_reftime);
 	if (unlikely(time_status & STA_FREQHOLD))
 		secs = 0;
 
-	time_reftime = __ktime_get_real_seconds();
+	time_reftime = real_secs;
 
 	offset64 = offset;
 	freq_adj = ntp_update_offset_fll(offset64, secs);

From 68f66f97c5689825012877f58df65964056d4b5d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 11 Sep 2024 15:17:43 +0200
Subject: [PATCH 007/140] ntp: Introduce struct ntp_data

All NTP data is held in static variables. That prevents the NTP code from
being reusable for non-system time timekeepers, e.g. per PTP clock
timekeeping.

Introduce struct ntp_data and move tick_usec into it for a start.

No functional change.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-7-2d52f4e13476@linutronix.de
---
 kernel/time/ntp.c | 65 ++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 0bfd07de202a56..f95f23385a62ad 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,16 +22,19 @@
 #include "ntp_internal.h"
 #include "timekeeping_internal.h"
 
-
-/*
- * NTP timekeeping variables:
+/**
+ * struct ntp_data - Structure holding all NTP related state
+ * @tick_usec:		USER_HZ period in microseconds
  *
- * Note: All of the NTP state is protected by the timekeeping locks.
+ * Protected by the timekeeping locks.
*/ +struct ntp_data { + unsigned long tick_usec; +}; - -/* USER_HZ period (usecs): */ -static unsigned long tick_usec = USER_TICK_USEC; +static struct ntp_data tk_ntp_data = { + .tick_usec = USER_TICK_USEC, +}; static u64 tick_length; static u64 tick_length_base; @@ -245,13 +248,11 @@ static inline void pps_fill_timex(struct __kernel_timex *txc) * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and * time_freq: */ -static void ntp_update_frequency(void) +static void ntp_update_frequency(struct ntp_data *ntpdata) { - u64 second_length; - u64 new_base; + u64 second_length, new_base, tick_usec = (u64)ntpdata->tick_usec; - second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += ntp_tick_adj; second_length += time_freq; @@ -330,10 +331,7 @@ static void ntp_update_offset(long offset) time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } -/** - * ntp_clear - Clears the NTP state variables - */ -void ntp_clear(void) +static void __ntp_clear(struct ntp_data *ntpdata) { /* Stop active adjtime() */ time_adjust = 0; @@ -341,7 +339,7 @@ void ntp_clear(void) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - ntp_update_frequency(); + ntp_update_frequency(ntpdata); tick_length = tick_length_base; time_offset = 0; @@ -351,6 +349,14 @@ void ntp_clear(void) pps_clear(); } +/** + * ntp_clear - Clears the NTP state variables + */ +void ntp_clear(void) +{ + __ntp_clear(&tk_ntp_data); +} + u64 ntp_tick_length(void) { @@ -706,7 +712,7 @@ static inline void process_adj_status(const struct __kernel_timex *txc) } -static inline void process_adjtimex_modes(const struct __kernel_timex *txc, +static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct __kernel_timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) @@ -747,13 +753,12 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; + ntpdata->tick_usec = txc->tick; if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); + ntp_update_frequency(ntpdata); } - /* * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
@@ -761,6 +766,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { + struct ntp_data *ntpdata = &tk_ntp_data; int result; if (txc->modes & ADJ_ADJTIME) { @@ -769,7 +775,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; - ntp_update_frequency(); + ntp_update_frequency(ntpdata); audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); @@ -782,15 +788,15 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); - process_adjtimex_modes(txc, time_tai); + process_adjtimex_modes(ntpdata, txc, time_tai); audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); } txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, @@ -811,7 +817,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->constant = time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; - txc->tick = tick_usec; + txc->tick = ntpdata->tick_usec; txc->tai = *time_tai; /* Fill PPS status fields */ @@ -932,7 +938,7 @@ static inline void pps_inc_freq_interval(void) * too long, the data are discarded. * Returns the difference between old and new frequency values. */ -static long hardpps_update_freq(struct pps_normtime freq_norm) +static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime freq_norm) { long delta, delta_mod; s64 ftemp; @@ -979,7 +985,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) /* If enabled, the system clock frequency is updated */ if ((time_status & STA_PPSFREQ) && !(time_status & STA_FREQHOLD)) { time_freq = pps_freq; - ntp_update_frequency(); + ntp_update_frequency(ntpdata); } return delta; @@ -1030,6 +1036,7 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; + struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1070,7 +1077,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t pps_calcnt++; /* Restart the frequency calibration interval */ pps_fbase = *raw_ts; - hardpps_update_freq(freq_norm); + hardpps_update_freq(ntpdata, freq_norm); } hardpps_update_phase(pts_norm.nsec); From ec93ec22aa10fb5311c0f068ee66c5b6d39788fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:44 +0200 Subject: [PATCH 008/140] ntp: Move tick_length* into ntp_data Continue the conversion from static variables to struct based data. No functional change. 
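
The pattern is the same as for tick_usec before: the file scope variable
becomes a member of struct ntp_data and all users are converted to go
through the (for now single) instance. Condensed from the diff below,
ntp_tick_length() for example changes from

  static u64 tick_length;

  u64 ntp_tick_length(void)
  {
          return tick_length;
  }

to

  u64 ntp_tick_length(void)
  {
          return tk_ntp_data.tick_length;
  }

with tick_length now being a member of tk_ntp_data.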
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-8-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f95f23385a62ad..2430e69743ee26 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -25,20 +25,21 @@ /** * struct ntp_data - Structure holding all NTP related state * @tick_usec: USER_HZ period in microseconds + * @tick_length: Adjusted tick length + * @tick_length_base: Base value for @tick_length * * Protected by the timekeeping locks. */ struct ntp_data { unsigned long tick_usec; + u64 tick_length; + u64 tick_length_base; }; static struct ntp_data tk_ntp_data = { .tick_usec = USER_TICK_USEC, }; -static u64 tick_length; -static u64 tick_length_base; - #define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ @@ -263,8 +264,8 @@ static void ntp_update_frequency(struct ntp_data *ntpdata) * Don't wait for the next second_overflow, apply the change to the * tick length immediately: */ - tick_length += new_base - tick_length_base; - tick_length_base = new_base; + ntpdata->tick_length += new_base - ntpdata->tick_length_base; + ntpdata->tick_length_base = new_base; } static inline s64 ntp_update_offset_fll(s64 offset64, long secs) @@ -341,8 +342,8 @@ static void __ntp_clear(struct ntp_data *ntpdata) ntp_update_frequency(ntpdata); - tick_length = tick_length_base; - time_offset = 0; + ntpdata->tick_length = ntpdata->tick_length_base; + time_offset = 0; ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ @@ -360,7 +361,7 @@ void ntp_clear(void) u64 ntp_tick_length(void) { - return tick_length; + return tk_ntp_data.tick_length; } /** @@ -391,6 +392,7 @@ ktime_t ntp_get_next_leap(void) */ int second_overflow(time64_t secs) { + struct ntp_data *ntpdata = &tk_ntp_data; s64 delta; int leap = 0; s32 rem; @@ -451,11 +453,11 @@ int second_overflow(time64_t secs) } /* Compute the phase adjustment for the next second */ - tick_length = tick_length_base; + ntpdata->tick_length = ntpdata->tick_length_base; - delta = ntp_offset_chunk(time_offset); - time_offset -= delta; - tick_length += delta; + delta = ntp_offset_chunk(time_offset); + time_offset -= delta; + ntpdata->tick_length += delta; /* Check PPS signal */ pps_dec_valid(); @@ -465,18 +467,18 @@ int second_overflow(time64_t secs) if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; + ntpdata->tick_length += MAX_TICKADJ_SCALED; goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; + ntpdata->tick_length -= MAX_TICKADJ_SCALED; goto out; } - tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) - << NTP_SCALE_SHIFT; + ntpdata->tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; time_adjust = 0; out: From bee18a2301f97465a464176767f3a3a64f900d93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:45 +0200 Subject: [PATCH 009/140] ntp: Move tick_stat* into ntp_data Continue the conversion from static variables to struct based data. No functional change. 
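
As with the previous conversions, helpers which operate on this state now
take a pointer to the ntp_data instance instead of touching file scope
variables. Condensed from the diff below:

  /* Before: implicitly works on the global state */
  static inline s64 ntp_offset_chunk(s64 offset)
  {
          if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
                  return offset;
          else
                  return shift_right(offset, SHIFT_PLL + time_constant);
  }

  /* After: works on the instance handed in by the caller */
  static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset)
  {
          if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL)
                  return offset;
          else
                  return shift_right(offset, SHIFT_PLL + time_constant);
  }

time_constant is still a static variable at this point; it is moved into
struct ntp_data by a later patch in the series.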
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-9-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 175 ++++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 90 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 2430e69743ee26..42c039ab8139ad 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -27,6 +27,8 @@ * @tick_usec: USER_HZ period in microseconds * @tick_length: Adjusted tick length * @tick_length_base: Base value for @tick_length + * @time_state: State of the clock synchronization + * @time_status: Clock status bits * * Protected by the timekeeping locks. */ @@ -34,10 +36,14 @@ struct ntp_data { unsigned long tick_usec; u64 tick_length; u64 tick_length_base; + int time_state; + int time_status; }; static struct ntp_data tk_ntp_data = { .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, }; #define SECS_PER_DAY 86400 @@ -53,16 +59,6 @@ static struct ntp_data tk_ntp_data = { * estimated error = NTP dispersion. */ -/* - * clock synchronization status - * - * (TIME_ERROR prevents overwriting the CMOS clock) - */ -static int time_state = TIME_OK; - -/* clock status bits: */ -static int time_status = STA_UNSYNC; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -127,9 +123,9 @@ static long pps_errcnt; /* calibration errors */ * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. */ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL) return offset; else return shift_right(offset, SHIFT_PLL + time_constant); @@ -159,13 +155,13 @@ static inline void pps_clear(void) * Decrease pps_valid to indicate that another second has passed since the * last PPS signal. When it reaches 0, indicate that PPS signal is missing. 
*/ -static inline void pps_dec_valid(void) +static inline void pps_dec_valid(struct ntp_data *ntpdata) { if (pps_valid > 0) pps_valid--; else { - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); + ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); pps_clear(); } } @@ -198,12 +194,12 @@ static inline bool is_error_status(int status) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; - if (!(time_status & STA_NANO)) + if (!(ntpdata->time_status & STA_NANO)) txc->jitter = pps_jitter / NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; @@ -215,14 +211,14 @@ static inline void pps_fill_timex(struct __kernel_timex *txc) #else /* !CONFIG_NTP_PPS */ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntp, s64 offset) { return shift_right(offset, SHIFT_PLL + time_constant); } static inline void pps_reset_freq_interval(void) {} static inline void pps_clear(void) {} -static inline void pps_dec_valid(void) {} +static inline void pps_dec_valid(struct ntp_data *ntpdata) {} static inline void pps_set_freq(s64 freq) {} static inline bool is_error_status(int status) @@ -230,7 +226,7 @@ static inline bool is_error_status(int status) return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -268,30 +264,30 @@ static void ntp_update_frequency(struct ntp_data *ntpdata) ntpdata->tick_length_base = new_base; } -static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +static inline s64 ntp_update_offset_fll(struct ntp_data *ntpdata, s64 offset64, long secs) { - time_status &= ~STA_MODE; + ntpdata->time_status &= ~STA_MODE; if (secs < MINSEC) return 0; - if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + if (!(ntpdata->time_status & STA_FLL) && (secs <= MAXSEC)) return 0; - time_status |= STA_MODE; + ntpdata->time_status |= STA_MODE; return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } -static void ntp_update_offset(long offset) +static void ntp_update_offset(struct ntp_data *ntpdata, long offset) { s64 freq_adj, offset64; long secs, real_secs; - if (!(time_status & STA_PLL)) + if (!(ntpdata->time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) { + if (!(ntpdata->time_status & STA_NANO)) { /* Make sure the multiplication below won't overflow */ offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; @@ -306,13 +302,13 @@ static void ntp_update_offset(long offset) */ real_secs = __ktime_get_real_seconds(); secs = (long)(real_secs - time_reftime); - if (unlikely(time_status & STA_FREQHOLD)) + if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; time_reftime = real_secs; offset64 = offset; - freq_adj = ntp_update_offset_fll(offset64, secs); + freq_adj = ntp_update_offset_fll(ntpdata, offset64, secs); /* * Clamp update interval to reduce PLL gain with low @@ -335,10 +331,10 @@ static void ntp_update_offset(long offset) static void __ntp_clear(struct ntp_data *ntpdata) { /* Stop active adjtime() */ - time_adjust = 0; - time_status |= STA_UNSYNC; 
- time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + time_adjust = 0; + ntpdata->time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(ntpdata); @@ -372,9 +368,10 @@ u64 ntp_tick_length(void) */ ktime_t ntp_get_next_leap(void) { + struct ntp_data *ntpdata = &tk_ntp_data; ktime_t ret; - if ((time_state == TIME_INS) && (time_status & STA_INS)) + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntp_next_leap_sec, 0); ret = KTIME_MAX; return ret; @@ -402,46 +399,46 @@ int second_overflow(time64_t secs) * day, the system clock is set back one second; if in leap-delete * state, the system clock is set ahead one second. */ - switch (time_state) { + switch (ntpdata->time_state) { case TIME_OK: - if (time_status & STA_INS) { - time_state = TIME_INS; + if (ntpdata->time_status & STA_INS) { + ntpdata->time_state = TIME_INS; div_s64_rem(secs, SECS_PER_DAY, &rem); ntp_next_leap_sec = secs + SECS_PER_DAY - rem; - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; + } else if (ntpdata->time_status & STA_DEL) { + ntpdata->time_state = TIME_DEL; div_s64_rem(secs + 1, SECS_PER_DAY, &rem); ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: - if (!(time_status & STA_INS)) { + if (!(ntpdata->time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; + ntpdata->time_state = TIME_OK; } else if (secs == ntp_next_leap_sec) { leap = -1; - time_state = TIME_OOP; + ntpdata->time_state = TIME_OOP; pr_notice("Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: - if (!(time_status & STA_DEL)) { + if (!(ntpdata->time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; + ntpdata->time_state = TIME_OK; } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; + ntpdata->time_state = TIME_WAIT; pr_notice("Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; + ntpdata->time_state = TIME_WAIT; break; case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; + if (!(ntpdata->time_status & (STA_INS | STA_DEL))) + ntpdata->time_state = TIME_OK; break; } @@ -449,18 +446,18 @@ int second_overflow(time64_t secs) time_maxerror += MAXFREQ / NSEC_PER_USEC; if (time_maxerror > NTP_PHASE_LIMIT) { time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; + ntpdata->time_status |= STA_UNSYNC; } /* Compute the phase adjustment for the next second */ ntpdata->tick_length = ntpdata->tick_length_base; - delta = ntp_offset_chunk(time_offset); + delta = ntp_offset_chunk(ntpdata, time_offset); time_offset -= delta; ntpdata->tick_length += delta; /* Check PPS signal */ - pps_dec_valid(); + pps_dec_valid(ntpdata); if (!time_adjust) goto out; @@ -608,7 +605,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(time_status & STA_UNSYNC); + return !(tk_ntp_data.time_status & STA_UNSYNC); } /* @@ -691,11 +688,11 @@ static inline void __init ntp_init_cmos_sync(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct __kernel_timex *txc) +static inline void process_adj_status(struct ntp_data *ntpdata, const struct __kernel_timex *txc) { - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status 
= STA_UNSYNC; + if ((ntpdata->time_status & STA_PLL) && !(txc->status & STA_PLL)) { + ntpdata->time_state = TIME_OK; + ntpdata->time_status = STA_UNSYNC; ntp_next_leap_sec = TIME64_MAX; /* Restart PPS frequency calibration */ pps_reset_freq_interval(); @@ -705,26 +702,25 @@ static inline void process_adj_status(const struct __kernel_timex *txc) * If we turn on PLL adjustments then reset the * reference time to current time. */ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = __ktime_get_real_seconds(); - /* Only set allowed bits */ - time_status &= STA_RONLY; - time_status |= txc->status & ~STA_RONLY; + /* only set allowed bits */ + ntpdata->time_status &= STA_RONLY; + ntpdata->time_status |= txc->status & ~STA_RONLY; } - static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct __kernel_timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) - process_adj_status(txc); + process_adj_status(ntpdata, txc); if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; + ntpdata->time_status |= STA_NANO; if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; + ntpdata->time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { time_freq = txc->freq * PPM_SCALE; @@ -742,17 +738,16 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct if (txc->modes & ADJ_TIMECONST) { time_constant = clamp(txc->constant, 0, MAXTC); - if (!(time_status & STA_NANO)) + if (!(ntpdata->time_status & STA_NANO)) time_constant += 4; time_constant = clamp(time_constant, 0, MAXTC); } - if (txc->modes & ADJ_TAI && - txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) + if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) - ntp_update_offset(txc->offset); + ntp_update_offset(ntpdata, txc->offset); if (txc->modes & ADJ_TICK) ntpdata->tick_usec = txc->tick; @@ -788,7 +783,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, if (txc->modes) { audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); @@ -796,26 +791,26 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); } txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) + if (!(ntpdata->time_status & STA_NANO)) txc->offset = (u32)txc->offset / NSEC_PER_USEC; } - result = time_state; - if (is_error_status(time_status)) + result = ntpdata->time_state; + if (is_error_status(ntpdata->time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; - txc->status = time_status; + txc->status = ntpdata->time_status; txc->constant = time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; @@ -823,26 
+818,26 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->tai = *time_tai; /* Fill PPS status fields */ - pps_fill_timex(txc); + pps_fill_timex(ntpdata, txc); txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; - if (!(time_status & STA_NANO)) + if (!(ntpdata->time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { - if ((time_state == TIME_INS) && (time_status & STA_INS)) { + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) { result = TIME_OOP; txc->tai++; txc->time.tv_sec--; } - if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + if ((ntpdata->time_state == TIME_DEL) && (ntpdata->time_status & STA_DEL)) { result = TIME_WAIT; txc->tai--; txc->time.tv_sec++; } - if ((time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) + if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) result = TIME_WAIT; } @@ -947,7 +942,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr /* Check if the frequency interval was too long */ if (freq_norm.sec > (2 << pps_shift)) { - time_status |= STA_PPSERROR; + ntpdata->time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", @@ -966,7 +961,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); - time_status |= STA_PPSWANDER; + ntpdata->time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); } else { @@ -985,7 +980,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; /* If enabled, the system clock frequency is updated */ - if ((time_status & STA_PPSFREQ) && !(time_status & STA_FREQHOLD)) { + if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { time_freq = pps_freq; ntp_update_frequency(ntpdata); } @@ -994,7 +989,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr } /* Correct REALTIME clock phase error against PPS signal */ -static void hardpps_update_phase(long error) +static void hardpps_update_phase(struct ntp_data *ntpdata, long error) { long correction = -error; long jitter; @@ -1011,9 +1006,9 @@ static void hardpps_update_phase(long error) if (jitter > (pps_jitter << PPS_POPCORN)) { printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (pps_jitter << PPS_POPCORN)); - time_status |= STA_PPSJITTER; + ntpdata->time_status |= STA_PPSJITTER; pps_jitcnt++; - } else if (time_status & STA_PPSTIME) { + } else if (ntpdata->time_status & STA_PPSTIME) { /* Correct the time using the phase offset */ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); /* Cancel running adjtime() */ @@ -1043,10 +1038,10 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t pts_norm = pps_normalize_ts(*phase_ts); /* Clear the error bits, they will be set again if needed */ - time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + ntpdata->time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); - /* Indicate signal presence */ - time_status |= STA_PPSSIGNAL; + /* indicate signal presence */ + ntpdata->time_status |= STA_PPSSIGNAL; 
pps_valid = PPS_VALID; /* @@ -1067,7 +1062,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t */ if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { - time_status |= STA_PPSJITTER; + ntpdata->time_status |= STA_PPSJITTER; /* Restart the frequency calibration interval */ pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); @@ -1082,7 +1077,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t hardpps_update_freq(ntpdata, freq_norm); } - hardpps_update_phase(pts_norm.nsec); + hardpps_update_phase(ntpdata, pts_norm.nsec); } #endif /* CONFIG_NTP_PPS */ From d51435548e4c406395d7cc479820a0a962d65af6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:46 +0200 Subject: [PATCH 010/140] ntp: Move time_offset/constant into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-10-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 49 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 42c039ab8139ad..5a6c3254090166 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -29,6 +29,8 @@ * @tick_length_base: Base value for @tick_length * @time_state: State of the clock synchronization * @time_status: Clock status bits + * @time_offset: Time adjustment in nanoseconds + * @time_constant: PLL time constant * * Protected by the timekeeping locks. */ @@ -38,12 +40,15 @@ struct ntp_data { u64 tick_length_base; int time_state; int time_status; + s64 time_offset; + long time_constant; }; static struct ntp_data tk_ntp_data = { .tick_usec = USER_TICK_USEC, .time_state = TIME_OK, .time_status = STA_UNSYNC, + .time_constant = 2, }; #define SECS_PER_DAY 86400 @@ -59,12 +64,6 @@ static struct ntp_data tk_ntp_data = { * estimated error = NTP dispersion. */ -/* time adjustment (nsecs): */ -static s64 time_offset; - -/* pll time constant: */ -static long time_constant = 2; - /* maximum error (usecs): */ static long time_maxerror = NTP_PHASE_LIMIT; @@ -128,7 +127,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL) return offset; else - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } static inline void pps_reset_freq_interval(void) @@ -211,9 +210,9 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time #else /* !CONFIG_NTP_PPS */ -static inline s64 ntp_offset_chunk(struct ntp_data *ntp, s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } static inline void pps_reset_freq_interval(void) {} @@ -315,17 +314,17 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * sampling rate (e.g. intermittent network connection) * to avoid instability. 
*/ - if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) - secs = 1 << (SHIFT_PLL + 1 + time_constant); + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + ntpdata->time_constant))) + secs = 1 << (SHIFT_PLL + 1 + ntpdata->time_constant); freq_adj += (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + ntpdata->time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); time_freq = max(freq_adj, -MAXFREQ_SCALED); - time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + ntpdata->time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } static void __ntp_clear(struct ntp_data *ntpdata) @@ -339,7 +338,7 @@ static void __ntp_clear(struct ntp_data *ntpdata) ntp_update_frequency(ntpdata); ntpdata->tick_length = ntpdata->tick_length_base; - time_offset = 0; + ntpdata->time_offset = 0; ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ @@ -452,8 +451,8 @@ int second_overflow(time64_t secs) /* Compute the phase adjustment for the next second */ ntpdata->tick_length = ntpdata->tick_length_base; - delta = ntp_offset_chunk(ntpdata, time_offset); - time_offset -= delta; + delta = ntp_offset_chunk(ntpdata, ntpdata->time_offset); + ntpdata->time_offset -= delta; ntpdata->tick_length += delta; /* Check PPS signal */ @@ -737,10 +736,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = clamp(txc->constant, 0, MAXTC); + ntpdata->time_constant = clamp(txc->constant, 0, MAXTC); if (!(ntpdata->time_status & STA_NANO)) - time_constant += 4; - time_constant = clamp(time_constant, 0, MAXTC); + ntpdata->time_constant += 4; + ntpdata->time_constant = clamp(ntpdata->time_constant, 0, MAXTC); } if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) @@ -781,7 +780,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, } else { /* If there are input parameters, then process them: */ if (txc->modes) { - audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); @@ -789,15 +788,14 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, process_adjtimex_modes(ntpdata, txc, time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); } - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); + txc->offset = shift_right(ntpdata->time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(ntpdata->time_status & STA_NANO)) txc->offset = (u32)txc->offset / NSEC_PER_USEC; } @@ -811,7 +809,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = ntpdata->time_status; - txc->constant = time_constant; + txc->constant = ntpdata->time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = ntpdata->tick_usec; @@ -1010,7 +1008,8 @@ static void 
hardpps_update_phase(struct ntp_data *ntpdata, long error) pps_jitcnt++; } else if (ntpdata->time_status & STA_PPSTIME) { /* Correct the time using the phase offset */ - time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); /* Cancel running adjtime() */ time_adjust = 0; } From 7891cf2961c0e99e026d911cbf1ec4aeb938750d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:47 +0200 Subject: [PATCH 011/140] ntp: Move time_max/esterror into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-11-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5a6c3254090166..67c411707acda6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,6 +31,9 @@ * @time_status: Clock status bits * @time_offset: Time adjustment in nanoseconds * @time_constant: PLL time constant + * @time_maxerror: Maximum error in microseconds holding the NTP sync distance + * (NTP dispersion + delay / 2) + * @time_esterror: Estimated error in microseconds holding NTP dispersion * * Protected by the timekeeping locks. */ @@ -42,6 +45,8 @@ struct ntp_data { int time_status; s64 time_offset; long time_constant; + long time_maxerror; + long time_esterror; }; static struct ntp_data tk_ntp_data = { @@ -49,6 +54,8 @@ static struct ntp_data tk_ntp_data = { .time_state = TIME_OK, .time_status = STA_UNSYNC, .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, }; #define SECS_PER_DAY 86400 @@ -57,19 +64,6 @@ static struct ntp_data tk_ntp_data = { (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -/* - * phase-lock loop variables - * - * Note: maximum error = NTP sync distance = dispersion + delay / 2; - * estimated error = NTP dispersion. 
- */ - -/* maximum error (usecs): */ -static long time_maxerror = NTP_PHASE_LIMIT; - -/* estimated error (usecs): */ -static long time_esterror = NTP_PHASE_LIMIT; - /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; @@ -332,8 +326,8 @@ static void __ntp_clear(struct ntp_data *ntpdata) /* Stop active adjtime() */ time_adjust = 0; ntpdata->time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntpdata->time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(ntpdata); @@ -442,9 +436,9 @@ int second_overflow(time64_t secs) } /* Bump the maxerror field */ - time_maxerror += MAXFREQ / NSEC_PER_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (ntpdata->time_maxerror > NTP_PHASE_LIMIT) { + ntpdata->time_maxerror = NTP_PHASE_LIMIT; ntpdata->time_status |= STA_UNSYNC; } @@ -730,10 +724,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct } if (txc->modes & ADJ_MAXERROR) - time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); + ntpdata->time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); + ntpdata->time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { ntpdata->time_constant = clamp(txc->constant, 0, MAXTC); @@ -806,8 +800,8 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; + txc->maxerror = ntpdata->time_maxerror; + txc->esterror = ntpdata->time_esterror; txc->status = ntpdata->time_status; txc->constant = ntpdata->time_constant; txc->precision = 1; From 161b8ec281c38d8747f0ae033126208698cad33f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:48 +0200 Subject: [PATCH 012/140] ntp: Move time_freq/reftime into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-12-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 67c411707acda6..5bce6a406f3869 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -34,6 +34,8 @@ * @time_maxerror: Maximum error in microseconds holding the NTP sync distance * (NTP dispersion + delay / 2) * @time_esterror: Estimated error in microseconds holding NTP dispersion + * @time_freq: Frequency offset scaled nsecs/secs + * @time_reftime: Time at last adjustment in seconds * * Protected by the timekeeping locks. 
*/ @@ -47,6 +49,8 @@ struct ntp_data { long time_constant; long time_maxerror; long time_esterror; + s64 time_freq; + time64_t time_reftime; }; static struct ntp_data tk_ntp_data = { @@ -64,12 +68,6 @@ static struct ntp_data tk_ntp_data = { (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -/* frequency offset (scaled nsecs/secs): */ -static s64 time_freq; - -/* time at last adjustment (secs): */ -static time64_t time_reftime; - static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ @@ -245,7 +243,7 @@ static void ntp_update_frequency(struct ntp_data *ntpdata) second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += ntp_tick_adj; - second_length += time_freq; + second_length += ntpdata->time_freq; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); @@ -294,11 +292,11 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * and in which mode (PLL or FLL). */ real_secs = __ktime_get_real_seconds(); - secs = (long)(real_secs - time_reftime); + secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; - time_reftime = real_secs; + ntpdata->time_reftime = real_secs; offset64 = offset; freq_adj = ntp_update_offset_fll(ntpdata, offset64, secs); @@ -314,9 +312,9 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) freq_adj += (offset64 * secs) << (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + ntpdata->time_constant)); - freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + freq_adj = min(freq_adj + ntpdata->time_freq, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); + ntpdata->time_freq = max(freq_adj, -MAXFREQ_SCALED); ntpdata->time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } @@ -696,7 +694,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. 
*/ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; @@ -716,11 +714,11 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct ntpdata->time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); + ntpdata->time_freq = txc->freq * PPM_SCALE; + ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED); + ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED); /* Update pps_freq */ - pps_set_freq(time_freq); + pps_set_freq(ntpdata->time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -775,7 +773,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* If there are input parameters, then process them: */ if (txc->modes) { audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); - audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); @@ -783,7 +781,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, process_adjtimex_modes(ntpdata, txc, time_tai); audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); - audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); @@ -798,7 +796,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, if (is_error_status(ntpdata->time_status)) result = TIME_ERROR; - txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + txc->freq = shift_right((ntpdata->time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = ntpdata->time_maxerror; txc->esterror = ntpdata->time_esterror; @@ -973,7 +971,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr /* If enabled, the system clock frequency is updated */ if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { - time_freq = pps_freq; + ntpdata->time_freq = pps_freq; ntp_update_frequency(ntpdata); } From bb6400a298d8bab8074a9e78ae778ce7b238493d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:49 +0200 Subject: [PATCH 013/140] ntp: Move time_adj/ntp_tick_adj into ntp_data Continue the conversion from static variables to struct based data. No functional change. 
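[ editor's note: an illustrative aside, not part of the change. The
  second_overflow() hunk below is where @time_adjust matters: it holds
  the outstanding adjtime() correction in microseconds and is drained
  in MAX_TICKADJ-bounded chunks, one chunk per second. A minimal
  self-contained sketch of that bounded-slew idea; the helper name and
  the step limit are invented for the example and are not kernel API:

    /* Stand-in for MAX_TICKADJ; illustrative value only */
    #define STEP_LIMIT_USEC	500L

    /* Returns the chunk applied to the current second's tick length */
    static long drain_adjust(long *adjust_usec)
    {
        long step = *adjust_usec;

        if (step > STEP_LIMIT_USEC)
            step = STEP_LIMIT_USEC;
        else if (step < -STEP_LIMIT_USEC)
            step = -STEP_LIMIT_USEC;

        *adjust_usec -= step;	/* remainder is slewed in later seconds */
        return step;
    }
]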
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-13-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5bce6a406f3869..f9c2f267f12ebb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -36,6 +36,8 @@ * @time_esterror: Estimated error in microseconds holding NTP dispersion * @time_freq: Frequency offset scaled nsecs/secs * @time_reftime: Time at last adjustment in seconds + * @time_adjust: Adjustment value + * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled) * * Protected by the timekeeping locks. */ @@ -51,6 +53,8 @@ struct ntp_data { long time_esterror; s64 time_freq; time64_t time_reftime; + long time_adjust; + s64 ntp_tick_adj; }; static struct ntp_data tk_ntp_data = { @@ -68,11 +72,6 @@ static struct ntp_data tk_ntp_data = { (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -static long time_adjust; - -/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ -static s64 ntp_tick_adj; - /* second value of the next pending leapsecond, or TIME64_MAX if no leap */ static time64_t ntp_next_leap_sec = TIME64_MAX; @@ -242,7 +241,7 @@ static void ntp_update_frequency(struct ntp_data *ntpdata) second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; - second_length += ntp_tick_adj; + second_length += ntpdata->ntp_tick_adj; second_length += ntpdata->time_freq; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); @@ -322,7 +321,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) static void __ntp_clear(struct ntp_data *ntpdata) { /* Stop active adjtime() */ - time_adjust = 0; + ntpdata->time_adjust = 0; ntpdata->time_status |= STA_UNSYNC; ntpdata->time_maxerror = NTP_PHASE_LIMIT; ntpdata->time_esterror = NTP_PHASE_LIMIT; @@ -450,24 +449,24 @@ int second_overflow(time64_t secs) /* Check PPS signal */ pps_dec_valid(ntpdata); - if (!time_adjust) + if (!ntpdata->time_adjust) goto out; - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; + if (ntpdata->time_adjust > MAX_TICKADJ) { + ntpdata->time_adjust -= MAX_TICKADJ; ntpdata->tick_length += MAX_TICKADJ_SCALED; goto out; } - if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; + if (ntpdata->time_adjust < -MAX_TICKADJ) { + ntpdata->time_adjust += MAX_TICKADJ; ntpdata->tick_length -= MAX_TICKADJ_SCALED; goto out; } - ntpdata->tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + ntpdata->tick_length += (s64)(ntpdata->time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; - time_adjust = 0; + ntpdata->time_adjust = 0; out: return leap; @@ -758,15 +757,15 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, int result; if (txc->modes & ADJ_ADJTIME) { - long save_adjust = time_adjust; + long save_adjust = ntpdata->time_adjust; if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; + ntpdata->time_adjust = txc->offset; ntp_update_frequency(ntpdata); audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); - audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, ntpdata->time_adjust); } txc->offset = save_adjust; } else { @@ -1003,7 
+1002,7 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); /* Cancel running adjtime() */ - time_adjust = 0; + ntpdata->time_adjust = 0; } /* Update jitter */ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; @@ -1075,11 +1074,11 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); if (rc) return rc; - ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } From 75d956b947b7fc99df80a0db6677cdc30e70f75b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:50 +0200 Subject: [PATCH 014/140] ntp: Move ntp_next_leap_sec into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-14-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f9c2f267f12ebb..f156114f30c533 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -38,6 +38,7 @@ * @time_reftime: Time at last adjustment in seconds * @time_adjust: Adjustment value * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled) + * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap * * Protected by the timekeeping locks. 
*/ @@ -55,6 +56,7 @@ struct ntp_data { time64_t time_reftime; long time_adjust; s64 ntp_tick_adj; + time64_t ntp_next_leap_sec; }; static struct ntp_data tk_ntp_data = { @@ -64,6 +66,7 @@ static struct ntp_data tk_ntp_data = { .time_constant = 2, .time_maxerror = NTP_PHASE_LIMIT, .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, }; #define SECS_PER_DAY 86400 @@ -72,9 +75,6 @@ static struct ntp_data tk_ntp_data = { (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ -static time64_t ntp_next_leap_sec = TIME64_MAX; - #ifdef CONFIG_NTP_PPS /* @@ -331,7 +331,7 @@ static void __ntp_clear(struct ntp_data *ntpdata) ntpdata->tick_length = ntpdata->tick_length_base; ntpdata->time_offset = 0; - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } @@ -362,7 +362,7 @@ ktime_t ntp_get_next_leap(void) ktime_t ret; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) - return ktime_set(ntp_next_leap_sec, 0); + return ktime_set(ntpdata->ntp_next_leap_sec, 0); ret = KTIME_MAX; return ret; } @@ -394,18 +394,18 @@ int second_overflow(time64_t secs) if (ntpdata->time_status & STA_INS) { ntpdata->time_state = TIME_INS; div_s64_rem(secs, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (ntpdata->time_status & STA_DEL) { ntpdata->time_state = TIME_DEL; div_s64_rem(secs + 1, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(ntpdata->time_status & STA_INS)) { - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; ntpdata->time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = -1; ntpdata->time_state = TIME_OOP; pr_notice("Clock: inserting leap second 23:59:60 UTC\n"); @@ -413,17 +413,17 @@ int second_overflow(time64_t secs) break; case TIME_DEL: if (!(ntpdata->time_status & STA_DEL)) { - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; ntpdata->time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = 1; - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; ntpdata->time_state = TIME_WAIT; pr_notice("Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; ntpdata->time_state = TIME_WAIT; break; case TIME_WAIT: @@ -683,7 +683,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k if ((ntpdata->time_status & STA_PLL) && !(txc->status & STA_PLL)) { ntpdata->time_state = TIME_OK; ntpdata->time_status = STA_UNSYNC; - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Restart PPS frequency calibration */ pps_reset_freq_interval(); } @@ -815,7 +815,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ - if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if (unlikely(ts->tv_sec >= ntpdata->ntp_next_leap_sec)) { if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) { result = TIME_OOP; txc->tai++; @@ -826,7 +826,7 @@ int 
__do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, txc->tai--; txc->time.tv_sec++; } - if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) + if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntpdata->ntp_next_leap_sec)) result = TIME_WAIT; } From 931a177f7027ad0066c071912873a7a24e63240d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:51 +0200 Subject: [PATCH 015/140] ntp: Move pps_valid into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-15-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f156114f30c533..ad65ba28f4e7a0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -40,6 +40,8 @@ * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled) * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap * + * @pps_valid: PPS signal watchdog counter + * * Protected by the timekeeping locks. */ struct ntp_data { @@ -57,6 +59,9 @@ struct ntp_data { long time_adjust; s64 ntp_tick_adj; time64_t ntp_next_leap_sec; +#ifdef CONFIG_NTP_PPS + int pps_valid; +#endif }; static struct ntp_data tk_ntp_data = { @@ -91,7 +96,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static int pps_valid; /* signal watchdog counter */ static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ static struct timespec64 pps_fbase; /* beginning of the last freq interval */ @@ -147,9 +151,9 @@ static inline void pps_clear(void) */ static inline void pps_dec_valid(struct ntp_data *ntpdata) { - if (pps_valid > 0) - pps_valid--; - else { + if (ntpdata->pps_valid > 0) { + ntpdata->pps_valid--; + } else { ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); pps_clear(); @@ -1032,7 +1036,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t /* indicate signal presence */ ntpdata->time_status |= STA_PPSSIGNAL; - pps_valid = PPS_VALID; + ntpdata->pps_valid = PPS_VALID; /* * When called for the first time, just start the frequency From 5cc953b8ae0b2b7d0ebc7c3c0105e73ffaa03085 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:52 +0200 Subject: [PATCH 016/140] ntp: Move pps_ft into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-16-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ad65ba28f4e7a0..6a1ba27473fc21 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -41,6 +41,7 @@ * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap * * @pps_valid: PPS signal watchdog counter + * @pps_tf: PPS phase median filter * * Protected by the timekeeping locks. 
*/ @@ -61,6 +62,7 @@ struct ntp_data { time64_t ntp_next_leap_sec; #ifdef CONFIG_NTP_PPS int pps_valid; + long pps_tf[3]; #endif }; @@ -96,7 +98,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ @@ -134,13 +135,14 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables + * @ntpdata: Pointer to ntp data */ -static inline void pps_clear(void) +static inline void pps_clear(struct ntp_data *ntpdata) { pps_reset_freq_interval(); - pps_tf[0] = 0; - pps_tf[1] = 0; - pps_tf[2] = 0; + ntpdata->pps_tf[0] = 0; + ntpdata->pps_tf[1] = 0; + ntpdata->pps_tf[2] = 0; pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; pps_freq = 0; } @@ -156,7 +158,7 @@ static inline void pps_dec_valid(struct ntp_data *ntpdata) } else { ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); - pps_clear(); + pps_clear(ntpdata); } } @@ -211,7 +213,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) } static inline void pps_reset_freq_interval(void) {} -static inline void pps_clear(void) {} +static inline void pps_clear(struct ntp_data *ntpdata) {} static inline void pps_dec_valid(struct ntp_data *ntpdata) {} static inline void pps_set_freq(s64 freq) {} @@ -337,7 +339,7 @@ static void __ntp_clear(struct ntp_data *ntpdata) ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ - pps_clear(); + pps_clear(ntpdata); } /** @@ -870,22 +872,22 @@ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) } /* Get current phase correction and jitter */ -static inline long pps_phase_filter_get(long *jitter) +static inline long pps_phase_filter_get(struct ntp_data *ntpdata, long *jitter) { - *jitter = pps_tf[0] - pps_tf[1]; + *jitter = ntpdata->pps_tf[0] - ntpdata->pps_tf[1]; if (*jitter < 0) *jitter = -*jitter; /* TODO: test various filters */ - return pps_tf[0]; + return ntpdata->pps_tf[0]; } /* Add the sample to the phase filter */ -static inline void pps_phase_filter_add(long err) +static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err) { - pps_tf[2] = pps_tf[1]; - pps_tf[1] = pps_tf[0]; - pps_tf[0] = err; + ntpdata->pps_tf[2] = ntpdata->pps_tf[1]; + ntpdata->pps_tf[1] = ntpdata->pps_tf[0]; + ntpdata->pps_tf[0] = err; } /* @@ -988,8 +990,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) long jitter; /* Add the sample to the median filter */ - pps_phase_filter_add(correction); - correction = pps_phase_filter_get(&jitter); + pps_phase_filter_add(ntpdata, correction); + correction = pps_phase_filter_get(ntpdata, &jitter); /* * Nominal jitter is due to PPS signal noise. If it exceeds the From 9d7130dfc0e1c53112fcbed4b9f566d0f6fbc949 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:53 +0200 Subject: [PATCH 017/140] ntp: Move pps_jitter into ntp_data Continue the conversion from static variables to struct based data. No functional change. 
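[ editor's note: an illustrative aside, not part of the change. The
  hunks below show how this field is consumed: hardpps_update_phase()
  rejects outlier pulses whose jitter exceeds the running estimate
  shifted by PPS_POPCORN, then folds every sample into an exponential
  moving average. A simplified sketch, with invented shift values in
  place of PPS_POPCORN and PPS_INTMIN:

    static long jitter_avg;	/* running jitter estimate (ns) */

    /* Returns true if the sample passes the popcorn spike filter */
    static bool pps_sample_ok(long sample_jitter)
    {
        bool ok = sample_jitter <= (jitter_avg << 2);

        /* avg += (new - avg) / 2^shift: cheap exponential average */
        jitter_avg += (sample_jitter - jitter_avg) >> 2;
        return ok;
    }
]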
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-17-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6a1ba27473fc21..576f86a6a4f17b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -42,6 +42,7 @@ * * @pps_valid: PPS signal watchdog counter * @pps_tf: PPS phase median filter + * @pps_jitter: PPS current jitter in nanoseconds * * Protected by the timekeeping locks. */ @@ -63,6 +64,7 @@ struct ntp_data { #ifdef CONFIG_NTP_PPS int pps_valid; long pps_tf[3]; + long pps_jitter; #endif }; @@ -98,7 +100,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static long pps_jitter; /* current jitter (ns) */ static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ @@ -194,9 +195,9 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->jitter = pps_jitter; + txc->jitter = ntpdata->pps_jitter; if (!(ntpdata->time_status & STA_NANO)) - txc->jitter = pps_jitter / NSEC_PER_USEC; + txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; @@ -998,9 +999,9 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. */ - if (jitter > (pps_jitter << PPS_POPCORN)) { + if (jitter > (ntpdata->pps_jitter << PPS_POPCORN)) { printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + jitter, (ntpdata->pps_jitter << PPS_POPCORN)); ntpdata->time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (ntpdata->time_status & STA_PPSTIME) { @@ -1011,7 +1012,7 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) ntpdata->time_adjust = 0; } /* Update jitter */ - pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; + ntpdata->pps_jitter += (jitter - ntpdata->pps_jitter) >> PPS_INTMIN; } /* From db45e9bce8df2396740c0c03906ad6ed63948a8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:54 +0200 Subject: [PATCH 018/140] ntp: Move pps_fbase into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-18-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 576f86a6a4f17b..4bde69c4841dfa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,6 +43,7 @@ * @pps_valid: PPS signal watchdog counter * @pps_tf: PPS phase median filter * @pps_jitter: PPS current jitter in nanoseconds + * @pps_fbase: PPS beginning of the last freq interval * * Protected by the timekeeping locks. 
*/ @@ -65,6 +66,7 @@ struct ntp_data { int pps_valid; long pps_tf[3]; long pps_jitter; + struct timespec64 pps_fbase; #endif }; @@ -100,7 +102,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ @@ -144,7 +145,7 @@ static inline void pps_clear(struct ntp_data *ntpdata) ntpdata->pps_tf[0] = 0; ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; - pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; pps_freq = 0; } @@ -1045,13 +1046,13 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t * When called for the first time, just start the frequency * interval */ - if (unlikely(pps_fbase.tv_sec == 0)) { - pps_fbase = *raw_ts; + if (unlikely(ntpdata->pps_fbase.tv_sec == 0)) { + ntpdata->pps_fbase = *raw_ts; return; } /* Ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, ntpdata->pps_fbase)); /* * Check that the signal is in the range @@ -1061,7 +1062,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { ntpdata->time_status |= STA_PPSJITTER; /* Restart the frequency calibration interval */ - pps_fbase = *raw_ts; + ntpdata->pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } @@ -1070,7 +1071,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t if (freq_norm.sec >= (1 << pps_shift)) { pps_calcnt++; /* Restart the frequency calibration interval */ - pps_fbase = *raw_ts; + ntpdata->pps_fbase = *raw_ts; hardpps_update_freq(ntpdata, freq_norm); } From b1c89a762f753bedd5a62be4a5a586281be6f3c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:55 +0200 Subject: [PATCH 019/140] ntp: Move pps_shift/intcnt into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-19-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4bde69c4841dfa..bebff6c69c1851 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -44,6 +44,8 @@ * @pps_tf: PPS phase median filter * @pps_jitter: PPS current jitter in nanoseconds * @pps_fbase: PPS beginning of the last freq interval + * @pps_shift: PPS current interval duration in seconds (shift value) + * @pps_intcnt: PPS interval counter * * Protected by the timekeeping locks. 
*/ @@ -67,6 +69,8 @@ struct ntp_data { long pps_tf[3]; long pps_jitter; struct timespec64 pps_fbase; + int pps_shift; + int pps_intcnt; #endif }; @@ -102,8 +106,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static int pps_shift; /* current interval duration (s) (shift) */ -static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ static long pps_stabil; /* current stability (scaled ns/s) */ @@ -128,11 +130,11 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) { /* The PPS calibration interval may end surprisingly early */ - pps_shift = PPS_INTMIN; - pps_intcnt = 0; + ntpdata->pps_shift = PPS_INTMIN; + ntpdata->pps_intcnt = 0; } /** @@ -141,7 +143,7 @@ static inline void pps_reset_freq_interval(void) */ static inline void pps_clear(struct ntp_data *ntpdata) { - pps_reset_freq_interval(); + pps_reset_freq_interval(ntpdata); ntpdata->pps_tf[0] = 0; ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; @@ -199,7 +201,7 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time txc->jitter = ntpdata->pps_jitter; if (!(ntpdata->time_status & STA_NANO)) txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; - txc->shift = pps_shift; + txc->shift = ntpdata->pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; @@ -214,7 +216,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) {} +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} static inline void pps_clear(struct ntp_data *ntpdata) {} static inline void pps_dec_valid(struct ntp_data *ntpdata) {} static inline void pps_set_freq(s64 freq) {} @@ -693,7 +695,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k ntpdata->time_status = STA_UNSYNC; ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Restart PPS frequency calibration */ - pps_reset_freq_interval(); + pps_reset_freq_interval(ntpdata); } /* @@ -896,13 +898,13 @@ static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err) * Decrease frequency calibration interval length. It is halved after four * consecutive unstable intervals. */ -static inline void pps_dec_freq_interval(void) +static inline void pps_dec_freq_interval(struct ntp_data *ntpdata) { - if (--pps_intcnt <= -PPS_INTCOUNT) { - pps_intcnt = -PPS_INTCOUNT; - if (pps_shift > PPS_INTMIN) { - pps_shift--; - pps_intcnt = 0; + if (--ntpdata->pps_intcnt <= -PPS_INTCOUNT) { + ntpdata->pps_intcnt = -PPS_INTCOUNT; + if (ntpdata->pps_shift > PPS_INTMIN) { + ntpdata->pps_shift--; + ntpdata->pps_intcnt = 0; } } } @@ -911,13 +913,13 @@ static inline void pps_dec_freq_interval(void) * Increase frequency calibration interval length. It is doubled after * four consecutive stable intervals. 
*/ -static inline void pps_inc_freq_interval(void) +static inline void pps_inc_freq_interval(struct ntp_data *ntpdata) { - if (++pps_intcnt >= PPS_INTCOUNT) { - pps_intcnt = PPS_INTCOUNT; - if (pps_shift < PPS_INTMAX) { - pps_shift++; - pps_intcnt = 0; + if (++ntpdata->pps_intcnt >= PPS_INTCOUNT) { + ntpdata->pps_intcnt = PPS_INTCOUNT; + if (ntpdata->pps_shift < PPS_INTMAX) { + ntpdata->pps_shift++; + ntpdata->pps_intcnt = 0; } } } @@ -938,10 +940,10 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr s64 ftemp; /* Check if the frequency interval was too long */ - if (freq_norm.sec > (2 << pps_shift)) { + if (freq_norm.sec > (2 << ntpdata->pps_shift)) { ntpdata->time_status |= STA_PPSERROR; pps_errcnt++; - pps_dec_freq_interval(); + pps_dec_freq_interval(ntpdata); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; @@ -960,10 +962,10 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; pps_stbcnt++; - pps_dec_freq_interval(); + pps_dec_freq_interval(ntpdata); } else { /* Good sample */ - pps_inc_freq_interval(); + pps_inc_freq_interval(ntpdata); } /* @@ -1068,7 +1070,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t } /* Signal is ok. Check if the current frequency interval is finished */ - if (freq_norm.sec >= (1 << pps_shift)) { + if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { pps_calcnt++; /* Restart the frequency calibration interval */ ntpdata->pps_fbase = *raw_ts; From 12850b46583440911a2789355d25d8eb9fe8157d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:56 +0200 Subject: [PATCH 020/140] ntp: Move pps_freq/stabil into ntp_data Continue the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-20-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bebff6c69c1851..533367d7ccccab 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -46,6 +46,8 @@ * @pps_fbase: PPS beginning of the last freq interval * @pps_shift: PPS current interval duration in seconds (shift value) * @pps_intcnt: PPS interval counter + * @pps_freq: PPS frequency offset in scaled ns/s + * @pps_stabil: PPS current stability in scaled ns/s * * Protected by the timekeeping locks. 
*/ @@ -71,6 +73,8 @@ struct ntp_data { struct timespec64 pps_fbase; int pps_shift; int pps_intcnt; + s64 pps_freq; + long pps_stabil; #endif }; @@ -106,9 +110,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static s64 pps_freq; /* frequency offset (scaled ns/s) */ -static long pps_stabil; /* current stability (scaled ns/s) */ - /* * PPS signal quality monitors */ @@ -148,7 +149,7 @@ static inline void pps_clear(struct ntp_data *ntpdata) ntpdata->pps_tf[1] = 0; ntpdata->pps_tf[2] = 0; ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; - pps_freq = 0; + ntpdata->pps_freq = 0; } /* @@ -166,9 +167,9 @@ static inline void pps_dec_valid(struct ntp_data *ntpdata) } } -static inline void pps_set_freq(s64 freq) +static inline void pps_set_freq(struct ntp_data *ntpdata) { - pps_freq = freq; + ntpdata->pps_freq = ntpdata->time_freq; } static inline bool is_error_status(int status) @@ -196,13 +197,13 @@ static inline bool is_error_status(int status) static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { - txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + txc->ppsfreq = shift_right((ntpdata->pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = ntpdata->pps_jitter; if (!(ntpdata->time_status & STA_NANO)) txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; txc->shift = ntpdata->pps_shift; - txc->stabil = pps_stabil; + txc->stabil = ntpdata->pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; txc->errcnt = pps_errcnt; @@ -219,7 +220,7 @@ static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} static inline void pps_clear(struct ntp_data *ntpdata) {} static inline void pps_dec_valid(struct ntp_data *ntpdata) {} -static inline void pps_set_freq(s64 freq) {} +static inline void pps_set_freq(struct ntp_data *ntpdata) {} static inline bool is_error_status(int status) { @@ -727,7 +728,7 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED); ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED); /* Update pps_freq */ - pps_set_freq(ntpdata->time_freq); + pps_set_freq(ntpdata); } if (txc->modes & ADJ_MAXERROR) @@ -956,8 +957,8 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr */ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; + delta = shift_right(ftemp - ntpdata->pps_freq, NTP_SCALE_SHIFT); + ntpdata->pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; @@ -975,12 +976,12 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + ntpdata->pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - ntpdata->pps_stabil) >> PPS_INTMIN; /* If enabled, the system clock frequency is updated */ if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { - ntpdata->time_freq = pps_freq; + ntpdata->time_freq = 
ntpdata->pps_freq; ntp_update_frequency(ntpdata); } From 6fadb4a61d3fd4cdc6ede38a911b4abbfb43eed4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:57 +0200 Subject: [PATCH 021/140] ntp: Move pps monitors into ntp_data Finalize the conversion from static variables to struct based data. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-21-2d52f4e13476@linutronix.de --- kernel/time/ntp.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 533367d7ccccab..b550ebe0f03b22 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -48,6 +48,10 @@ * @pps_intcnt: PPS interval counter * @pps_freq: PPS frequency offset in scaled ns/s * @pps_stabil: PPS current stability in scaled ns/s + * @pps_calcnt: PPS monitor: calibration intervals + * @pps_jitcnt: PPS monitor: jitter limit exceeded + * @pps_stbcnt: PPS monitor: stability limit exceeded + * @pps_errcnt: PPS monitor: calibration errors * * Protected by the timekeeping locks. */ @@ -75,6 +79,10 @@ struct ntp_data { int pps_intcnt; s64 pps_freq; long pps_stabil; + long pps_calcnt; + long pps_jitcnt; + long pps_stbcnt; + long pps_errcnt; #endif }; @@ -110,15 +118,6 @@ static struct ntp_data tk_ntp_data = { intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -/* - * PPS signal quality monitors - */ -static long pps_calcnt; /* calibration intervals */ -static long pps_jitcnt; /* jitter limit exceeded */ -static long pps_stbcnt; /* stability limit exceeded */ -static long pps_errcnt; /* calibration errors */ - - /* * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. 
@@ -204,10 +203,10 @@ static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_time txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; txc->shift = ntpdata->pps_shift; txc->stabil = ntpdata->pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + txc->jitcnt = ntpdata->pps_jitcnt; + txc->calcnt = ntpdata->pps_calcnt; + txc->errcnt = ntpdata->pps_errcnt; + txc->stbcnt = ntpdata->pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ @@ -943,7 +942,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr /* Check if the frequency interval was too long */ if (freq_norm.sec > (2 << ntpdata->pps_shift)) { ntpdata->time_status |= STA_PPSERROR; - pps_errcnt++; + ntpdata->pps_errcnt++; pps_dec_freq_interval(ntpdata); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); @@ -962,7 +961,7 @@ static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime fr if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); ntpdata->time_status |= STA_PPSWANDER; - pps_stbcnt++; + ntpdata->pps_stbcnt++; pps_dec_freq_interval(ntpdata); } else { /* Good sample */ @@ -1007,7 +1006,7 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (ntpdata->pps_jitter << PPS_POPCORN)); ntpdata->time_status |= STA_PPSJITTER; - pps_jitcnt++; + ntpdata->pps_jitcnt++; } else if (ntpdata->time_status & STA_PPSTIME) { /* Correct the time using the phase offset */ ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, @@ -1072,7 +1071,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t /* Signal is ok. Check if the current frequency interval is finished */ if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { - pps_calcnt++; + ntpdata->pps_calcnt++; /* Restart the frequency calibration interval */ ntpdata->pps_fbase = *raw_ts; hardpps_update_freq(ntpdata, freq_norm); From 8102c4daf44ab86c2d2226a8136bec905d6e2bd1 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 11 Sep 2024 10:30:20 +0100 Subject: [PATCH 022/140] timekeeping: Add the boot clock to system time snapshot For tracing purpose, the boot clock is interesting as it doesn't stop on suspend. Export it as part of the time snapshot. This will later allow the hypervisor to add boot clock timestamps to its events. 
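[ editor's note: an illustrative use of the new field, not part of the
  change. A consumer fills the snapshot exactly as before and can now
  read the boot clock alongside the other bases; record_trace_ts() is
  a hypothetical sink for the timestamp:

    #include <linux/timekeeping.h>

    static void stamp_event(void)
    {
        struct system_time_snapshot snap;

        ktime_get_snapshot(&snap);
        /*
         * snap.boot keeps counting time spent in suspend, unlike a
         * plain monotonic timestamp, which suits trace correlation.
         */
        record_trace_ts(ktime_to_ns(snap.boot));
    }
]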
Signed-off-by: Vincent Donnefort Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911093029.3279154-5-vdonnefort@google.com --- include/linux/timekeeping.h | 2 ++ kernel/time/timekeeping.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c8842..e85c27347e44fd 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -275,6 +275,7 @@ struct ktime_timestamps { * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time + * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events @@ -283,6 +284,7 @@ struct ktime_timestamps { struct system_time_snapshot { u64 cycles; ktime_t real; + ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf3114a..47e44b9d267116 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1060,6 +1060,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1074,6 +1075,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); @@ -1081,6 +1084,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); From 8c111f1b967687f47bb0cfbedf2863b62c23223c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Sep 2024 13:43:34 -0400 Subject: [PATCH 023/140] timekeeping: Don't use seqcount loop in ktime_mono_to_any() on 64-bit systems ktime_mono_to_any() only fetches the offset inside the loop. This is a single word on 64-bit CPUs, and seqcount_read_begin() implies a full SMP barrier. Use READ_ONCE() to fetch the offset instead of doing a seqcount loop on 64-bit and add the matching WRITE_ONCE()'s to update the offsets in tk_set_wall_to_mono() and tk_update_sleep_time(). 
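[ editor's note: the pattern condensed, not part of the change. On
  64-bit a ktime_t is a single aligned word, so a marked read cannot
  observe a torn value and the retry loop is pure overhead; 32-bit
  still needs the seqcount because a 64-bit load can tear. A sketch,
  with offset_read() as an invented name for the combined fast/slow
  path shown in the hunk below:

    static ktime_t offset_read(const ktime_t *offset)
    {
        /* 64-bit: single-copy atomic, paired with WRITE_ONCE() */
        if (IS_ENABLED(CONFIG_64BIT))
            return READ_ONCE(*offset);

        for (;;) {	/* 32-bit: retry if a writer raced the load */
            unsigned int seq = read_seqcount_begin(&tk_core.seq);
            ktime_t val = *offset;

            if (!read_seqcount_retry(&tk_core.seq, seq))
                return val;
        }
    }
]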
[ tglx: Get rid of the #ifdeffery ] Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240910-mgtime-v3-1-84406ed53fad@kernel.org --- kernel/time/timekeeping.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 47e44b9d267116..a57f2eed2ce676 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -161,13 +161,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec64_to_ktime(tmp); - tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); + WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. @@ -930,6 +932,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) unsigned int seq; ktime_t tconv; + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and + * tk_update_sleep_time(). + */ + return ktime_add(tmono, READ_ONCE(*offset)); + } + do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); From 70c8fd00a9bd0509bbf7bccd9baea8bbd5ddc756 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:16 -0400 Subject: [PATCH 024/140] timekeeping: Add interfaces for handling timestamps with a floor value Multigrain timestamps allow the kernel to use fine-grained timestamps when an inode's attributes is being actively observed via ->getattr(). With this support, it's possible for a file to get a fine-grained timestamp, and another modified after it to get a coarse-grained stamp that is earlier than the fine-grained time. If this happens then the files can appear to have been modified in reverse order, which breaks VFS ordering guarantees [1]. To prevent this, maintain a floor value for multigrain timestamps. Whenever a fine-grained timestamp is handed out, record it, and when later coarse-grained stamps are handed out, ensure they are not earlier than that value. If the coarse-grained timestamp is earlier than the fine-grained floor, return the floor value instead. Add a static singleton atomic64_t into timekeeper.c that is used to keep track of the latest fine-grained time ever handed out. This is tracked as a monotonic ktime_t value to ensure that it isn't affected by clock jumps. Because it is updated at different times than the rest of the timekeeper object, the floor value is managed independently of the timekeeper via a cmpxchg() operation, and sits on its own cacheline. Add two new public interfaces: - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the coarse-grained clock and the floor time - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries to swap it into the floor. A timespec64 is filled with the result. The floor value is global and updated via a single try_cmpxchg(). 
If that fails then the operation raced with a concurrent update. Any concurrent update must be later than the existing floor value, so any racing tasks can accept any resulting floor value without retrying. [1]: POSIX requires that files be stamped with realtime clock values, and makes no provision for dealing with backward clock jumps. If a backward realtime clock jump occurs, then files can appear to have been modified in reverse order. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Acked-by: John Stultz Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 4 ++ kernel/time/timekeeping.c | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c8842..7aa85246c18357 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); +/* Multigrain timestamp interfaces */ +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); +extern void ktime_get_real_ts64_mg(struct timespec64 *ts); + void getboottime64(struct timespec64 *ts); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf3114a..441792c907fadc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +/* + * Multigrain timestamps require tracking the latest fine-grained timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. If + * such an event occurs, a timestamp can appear to be earlier than a previous one. + */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -2394,6 +2411,93 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. 
+ */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. + */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. + */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; From 96f9a366ec8abe027326d7aab84d64370019f0f1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:17 -0400 Subject: [PATCH 025/140] timekeeping: Add percpu counter for tracking floor swap events The mgtime_floor value is a global variable for tracking the latest fine-grained timestamp handed out. Because it's a global, track the number of times that a new floor value is assigned. Add a new percpu counter to the timekeeping code to track the number of floor swap events that have occurred. A later patch will add a debugfs file to display this counter alongside other stats involving multigrain timestamps. 
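[ editor's note: an illustrative restatement of the percpu counting
  pattern this patch applies, not new API. Each CPU increments its own
  slot, so the hot path never bounces a shared cache line; a reader
  sums all slots and accepts the slightly racy total, which is fine
  for statistics. The evt_* names are invented for the example:

    DEFINE_PER_CPU(unsigned long, evt_count);

    static inline void evt_inc(void)
    {
        this_cpu_inc(evt_count);	/* local increment, no shared atomics */
    }

    static unsigned long evt_sum(void)
    {
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
            sum += data_race(per_cpu(evt_count, cpu));

        return sum;	/* approximate snapshot of the total */
    }
]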
Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 1 + kernel/time/timekeeping_debug.c | 13 +++++++++++++ kernel/time/timekeeping_internal.h | 15 +++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7aa85246c18357..84a035e86ac811 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); +extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 441792c907fadc..962b2a31f01586 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2487,6 +2487,7 @@ void ktime_get_real_ts64_mg(struct timespec64 *ts) if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d9c..badeb222eab92a 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -17,6 +17,9 @@ #define NUM_BINS 32 +/* Incremented every time mg_floor is updated */ +DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_sleep_time_show(struct seq_file *s, void *data) @@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } +unsigned long timekeeping_get_mg_floor_swaps(void) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu)); + + return sum; +} diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ca2787d1642e2..0bbae825bc0226 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -10,9 +10,24 @@ * timekeeping debug functions */ #ifdef CONFIG_DEBUG_FS + +DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ + this_cpu_inc(timekeeping_mg_floor_swaps); +} + extern void tk_debug_account_sleep_time(const struct timespec64 *t); + #else + #define tk_debug_account_sleep_time(x) + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ +} + #endif #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE From bafffd56c608106d11e7aec851f114dcd66b2091 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 10 Oct 2024 14:54:46 +0100 Subject: [PATCH 026/140] clocksource: Remove unused clocksource_change_rating clocksource_change_rating() has been unused since 2017's commit 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") Remove it. __clocksource_change_rating now only has one use which is ifdef'd. Move it into the ifdef'd section. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241010135446.213098-1-linux@treblig.org --- include/linux/clocksource.h | 1 - kernel/time/clocksource.c | 40 ++++++++++--------------------------- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index d35b677b08fe13..ef1b16da6ad52e 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -215,7 +215,6 @@ static inline s64 clocksource_cyc2ns(u64 cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); extern struct clocksource * __init clocksource_default_clock(void); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 23336eecb4f43b..aab6472853fab9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -20,6 +20,8 @@ #include "tick-internal.h" #include "timekeeping_internal.h" +static void clocksource_enqueue(struct clocksource *cs); + static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { u64 delta = clocksource_delta(end, start, cs->mask); @@ -171,7 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) } static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); static void clocksource_watchdog_work(struct work_struct *work) { @@ -191,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } +static void clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -697,7 +705,7 @@ static int __clocksource_watchdog_kthread(void) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); + clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -1255,34 +1263,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); -static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); -} - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - unsigned long flags; - - mutex_lock(&clocksource_mutex); - clocksource_watchdog_lock(&flags); - __clocksource_change_rating(cs, rating); - clocksource_watchdog_unlock(&flags); - - clocksource_select(); - clocksource_select_watchdog(false); - clocksource_suspend_select(false); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_change_rating); - /* * Unbind clocksource @cs. 
Called with clocksource_mutex held */ From a849881a9e5426cb4fa00660529bc501718ef85b Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Wed, 9 Oct 2024 10:21:35 +0800 Subject: [PATCH 027/140] time: Remove '%' from numeric constant in kernel-doc comment Change %0 to 0 in kernel-doc comments. %0 is not valid. Signed-off-by: Wang Jinchao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241009022135.92400-2-wangjinchao@xfusion.com --- kernel/time/time.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 642647f5046be0..5984d4a5639bd5 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -866,7 +866,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, * * Handles compat or 32-bit modes. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) @@ -897,7 +897,7 @@ EXPORT_SYMBOL_GPL(get_timespec64); * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) @@ -944,7 +944,7 @@ static int __put_old_timespec32(const struct timespec64 *ts64, * * Handles X86_X32_ABI compatibility conversion. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { @@ -963,7 +963,7 @@ EXPORT_SYMBOL_GPL(get_old_timespec32); * * Handles X86_X32_ABI compatibility conversion. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { @@ -979,7 +979,7 @@ EXPORT_SYMBOL_GPL(put_old_timespec32); * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -1002,7 +1002,7 @@ EXPORT_SYMBOL_GPL(get_itimerspec64); * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) @@ -1024,7 +1024,7 @@ EXPORT_SYMBOL_GPL(put_itimerspec64); * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) @@ -1043,7 +1043,7 @@ EXPORT_SYMBOL_GPL(get_old_itimerspec32); * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) From 3a2e83d350950a84dddb0094c92e380f31fd5333 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:18 +0200 Subject: [PATCH 028/140] MAINTAINERS: Add missing file include/linux/delay.h include/linux/delay.h is not 
covered by MAINTAINERS file. Add it to the "HIGH-RESOLUTION TIMERS, TIMER WHEEL, CLOCKEVENTS" section. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-1-dc8b907cb62f@linutronix.de --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c27f3190737f8b..b52362566629d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10162,6 +10162,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/timers/ F: include/linux/clockchips.h +F: include/linux/delay.h F: include/linux/hrtimer.h F: include/linux/timer.h F: kernel/time/clockevents.c From da7bd0a9e0fce9f293b6e30c003f8f3978cee923 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:19 +0200 Subject: [PATCH 029/140] timers: Move *sleep*() and timeout functions into a separate file All schedule_timeout() and *sleep*() related functions are interfaces on top of timer list timers and hrtimers to add a sleep to the code. As they are built on top of the timer list timers and hrtimers, the [hr]timer interfaces are already used except when queuing the timer in schedule_timeout(). But there exists the appropriate interface add_timer() which does the same job with an extra check for an already pending timer. Split all those functions as they are into a separate file and use add_timer() instead of __mod_timer() in schedule_timeout(). While at it fix minor formatting issues and a multi line printk function call in schedule_timeout(). Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-2-dc8b907cb62f@linutronix.de --- MAINTAINERS | 1 + kernel/time/Makefile | 2 +- kernel/time/hrtimer.c | 120 -------------- kernel/time/sleep_timeout.c | 317 ++++++++++++++++++++++++++++++++++++ kernel/time/timer.c | 192 ---------------------- 5 files changed, 319 insertions(+), 313 deletions(-) create mode 100644 kernel/time/sleep_timeout.c diff --git a/MAINTAINERS b/MAINTAINERS index b52362566629d6..2250eb10ece1d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10167,6 +10167,7 @@ F: include/linux/hrtimer.h F: include/linux/timer.h F: kernel/time/clockevents.c F: kernel/time/hrtimer.c +F: kernel/time/sleep_timeout.c F: kernel/time/timer.c F: kernel/time/timer_list.c F: kernel/time/timer_migration.* diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 4af2a264a16073..fe0ae82124fe7f 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += time.o timer.o hrtimer.o +obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cddcd08ea827f9..04f7d8a392c397 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2242,123 +2242,3 @@ void __init hrtimers_init(void) hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } - -/** - * schedule_hrtimeout_range_clock - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * @clock_id: timer clock to be used - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, 
clockid_t clock_id) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. - */ - if (expires && *expires == 0) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - return -EINTR; - } - - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_sleeper_start_expires(&t, mode); - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly - * for regular (non RT/DL) tasks. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. 
- */
-int __sched schedule_hrtimeout(ktime_t *expires,
-			       const enum hrtimer_mode mode)
-{
-	return schedule_hrtimeout_range(expires, 0, mode);
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
new file mode 100644
index 00000000000000..78b2e7e30b1e0f
--- /dev/null
+++ b/kernel/time/sleep_timeout.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Kernel internal schedule timeout and sleeping functions
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "tick-internal.h"
+
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+	struct timer_list timer;
+	struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
+{
+	struct process_timer *timeout = from_timer(timeout, t, timer);
+
+	wake_up_process(timeout->task);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have elapsed.
+ * The function behavior depends on the current task state
+ * (see also set_current_state() description):
+ *
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
+ * at all. That happens because sched_submit_work() does nothing for
+ * tasks in %TASK_RUNNING state.
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns unless the current task is explicitly
+ * woken up, (e.g. by wake_up_process()).
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task or the current task is explicitly woken
+ * up.
+ *
+ * The current task state is guaranteed to be %TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * Returns: 0 when the timer has expired, otherwise the remaining time in
+ * jiffies will be returned. In all cases the return value is guaranteed
+ * to be non-negative.
+ */
+signed long __sched schedule_timeout(signed long timeout)
+{
+	struct process_timer timer;
+	unsigned long expire;
+
+	switch (timeout) {
+	case MAX_SCHEDULE_TIMEOUT:
+		/*
+		 * These two special cases are useful to be comfortable
+		 * in the caller. Nothing more. We could take
+		 * MAX_SCHEDULE_TIMEOUT from one of the negative values
+		 * but I'd like to return a valid offset (>=0) to allow
+		 * the caller to do everything it wants with the retval.
+		 */
+		schedule();
+		goto out;
+	default:
+		/*
+		 * Another bit of PARANOID. Note that the retval will be
+		 * 0 since no piece of kernel is supposed to do a check
+		 * for a negative retval of schedule_timeout() (since it
+		 * should never happen anyway). You just have the printk()
+		 * that will tell you if something has gone wrong and where.
+		 */
+		if (timeout < 0) {
+			pr_err("%s: wrong timeout value %lx\n", __func__, timeout);
+			dump_stack();
+			__set_current_state(TASK_RUNNING);
+			goto out;
+		}
+	}
+
+	expire = timeout + jiffies;
+
+	timer.task = current;
+	timer_setup_on_stack(&timer.timer, process_timeout, 0);
+	timer.timer.expires = expire;
+	add_timer(&timer.timer);
+	schedule();
+	del_timer_sync(&timer.timer);
+
+	/* Remove the timer from the object tracker */
+	destroy_timer_on_stack(&timer.timer);
+
+	timeout = expire - jiffies;
+
+ out:
+	return timeout < 0 ?
0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * @clock_id: timer clock to be used + */ +int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + hrtimer_sleeper_start_expires(&t, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. 
+ */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range_state() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. 
+ */ +void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(state); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0fc9d066a7be46..02355b275bab25 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -2526,141 +2525,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(); } -/* - * Since schedule_timeout()'s timer is defined on the stack, it must store - * the target task on the stack as well. - */ -struct process_timer { - struct timer_list timer; - struct task_struct *task; -}; - -static void process_timeout(struct timer_list *t) -{ - struct process_timer *timeout = from_timer(timeout, t, timer); - - wake_up_process(timeout->task); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have elapsed. - * The function behavior depends on the current task state - * (see also set_current_state() description): - * - * %TASK_RUNNING - the scheduler is called, but the task does not sleep - * at all. That happens because sched_submit_work() does nothing for - * tasks in %TASK_RUNNING state. - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be %TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed - * to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct process_timer timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. 
- */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - __set_current_state(TASK_RUNNING); - goto out; - } - } - - expire = timeout + jiffies; - - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); - schedule(); - del_timer_sync(&timer.timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* - * Like schedule_timeout_uninterruptible(), except this task will not contribute - * to load average. - */ -signed long __sched schedule_timeout_idle(signed long timeout) -{ - __set_current_state(TASK_IDLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_idle); - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { @@ -2757,59 +2621,3 @@ void __init init_timers(void) posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -/** - * usleep_range_state - Sleep for an approximate time in a given state - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - * @state: State of the current task that will be while sleeping - * - * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. 
- */
-void __sched usleep_range_state(unsigned long min, unsigned long max,
-				unsigned int state)
-{
-	ktime_t exp = ktime_add_us(ktime_get(), min);
-	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
-
-	for (;;) {
-		__set_current_state(state);
-		/* Do not return before the requested sleep time has elapsed */
-		if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
-			break;
-	}
-}
-EXPORT_SYMBOL(usleep_range_state);

From cf5b6ef0c36be3489972966b8a18aa5c48559661 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:20 +0200
Subject: [PATCH 030/140] timers: Update schedule_[hr]timeout*() related
 function descriptions

schedule_timeout*() functions do not have proper kernel-doc formatted
function descriptions. schedule_hrtimeout() and schedule_hrtimeout_range()
have an almost identical description.

Add missing function descriptions. Remove the copied function description
and add a pointer to the existing description instead.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-3-dc8b907cb62f@linutronix.de
---
 kernel/time/sleep_timeout.c | 66 +++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index 78b2e7e30b1e0f..560d17c30aa5d9 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -110,8 +110,17 @@ signed long __sched schedule_timeout(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout);
 
 /*
- * We can use __set_current_state() here because schedule_timeout() calls
- * schedule() unconditionally.
+ * __set_current_state() can be used in schedule_timeout_*() functions, because
+ * schedule_timeout() calls schedule() unconditionally.
+ */
+
+/**
+ * schedule_timeout_interruptible - sleep until timeout (interruptible)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_INTERRUPTIBLE before starting the timeout.
  */
 signed long __sched schedule_timeout_interruptible(signed long timeout)
 {
@@ -120,6 +129,14 @@ signed long __sched schedule_timeout_interruptible(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_interruptible);
 
+/**
+ * schedule_timeout_killable - sleep until timeout (killable)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_KILLABLE before starting the timeout.
+ */
 signed long __sched schedule_timeout_killable(signed long timeout)
 {
 	__set_current_state(TASK_KILLABLE);
@@ -127,6 +144,14 @@ signed long __sched schedule_timeout_killable(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_killable);
 
+/**
+ * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout.
+ */
 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 {
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -134,9 +159,15 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
-/*
- * Like schedule_timeout_uninterruptible(), except this task will not contribute
- * to load average.
+/**
+ * schedule_timeout_idle - sleep until timeout (idle)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_IDLE before starting the timeout. It is similar to
+ * schedule_timeout_uninterruptible(), except this task will not contribute to
+ * load average.
  */
 signed long __sched schedule_timeout_idle(signed long timeout)
 {
@@ -151,6 +182,9 @@ EXPORT_SYMBOL(schedule_timeout_idle);
  * @delta: slack in expires timeout (ktime_t)
  * @mode: timer mode
  * @clock_id: timer clock to be used
+ *
+ * Details are explained in the schedule_hrtimeout_range() function
+ * description, as that is the commonly used function.
  */
 int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
 					   const enum hrtimer_mode mode, clockid_t clock_id)
@@ -236,26 +270,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
  * @expires: timeout value (ktime_t)
  * @mode: timer mode
  *
- * Make the current task sleep until the given expiry time has
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Returns: 0 when the timer has expired. If the task was woken before the
- * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
- * by an explicit wakeup, it returns -EINTR.
+ * See schedule_hrtimeout_range() for details. The @delta argument of
+ * schedule_hrtimeout_range() is set to 0 and therefore has no impact.
  */
 int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode)
 {

From 102f085d84607462234ac60f6027973b45a9bde2 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:21 +0200
Subject: [PATCH 031/140] timers: Rename usleep_idle_range() to
 usleep_range_idle()

usleep_idle_range() is a variant of usleep_range(). Both use
usleep_range_state() as a base. To be able to find all the related
functions in one go, rename usleep_idle_range() to usleep_range_idle().

No functional change.
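For illustration, a typical call site after the rename (work_is_ready() is
a made-up placeholder; the patch changes nothing but the name):

    /* Poll in TASK_IDLE so the wait does not contribute to the load average */
    while (!work_is_ready())
    	usleep_range_idle(100, 200);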
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Reviewed-by: SeongJae Park
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-4-dc8b907cb62f@linutronix.de
---
 include/linux/delay.h | 2 +-
 mm/damon/core.c       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index ff9cda975e30d7..2bc586aa206849 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -68,7 +68,7 @@ static inline void usleep_range(unsigned long min, unsigned long max)
 	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
 }
 
-static inline void usleep_idle_range(unsigned long min, unsigned long max)
+static inline void usleep_range_idle(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_IDLE);
 }
diff --git a/mm/damon/core.c b/mm/damon/core.c
index a83f3b736d51e6..c725c78b43f019 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1896,7 +1896,7 @@ static void kdamond_usleep(unsigned long usecs)
 	if (usecs > 20 * USEC_PER_MSEC)
 		schedule_timeout_idle(usecs_to_jiffies(usecs));
 	else
-		usleep_idle_range(usecs, usecs + 1);
+		usleep_range_idle(usecs, usecs + 1);
 }
 
 /* Returns negative error code if it's not activated but should return */

From f36eb171410839325fff9cd9b7b7400f7e606962 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:22 +0200
Subject: [PATCH 032/140] timers: Update function descriptions of sleep/delay
 related functions

A lot of commonly used functions for inserting a sleep or delay lack a
proper function description. Add function descriptions to all of them to
have important information in a central place close to the code.

No functional change.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-5-dc8b907cb62f@linutronix.de
---
 include/asm-generic/delay.h | 41 +++++++++++++++++++++++++---
 include/linux/delay.h       | 48 ++++++++++++++++++++++++---------
 kernel/time/sleep_timeout.c | 53 ++++++++++++++++++++++++++++++-----
 3 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h
index e448ac61430cae..a8cee41cc51b68 100644
--- a/include/asm-generic/delay.h
+++ b/include/asm-generic/delay.h
@@ -12,11 +12,39 @@ extern void __const_udelay(unsigned long xloops);
 extern void __delay(unsigned long loops);
 
 /*
- * The weird n/20000 thing suppresses a "comparison is always false due to
- * limited range of data type" warning with non-const 8-bit arguments.
+ * Implementation details:
+ *
+ * * The weird n/20000 thing suppresses a "comparison is always false due to
+ *   limited range of data type" warning with non-const 8-bit arguments.
+ * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay
+ * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay
  */
-/* 0x10c7 is 2**32 / 1000000 (rounded up) */
+/**
+ * udelay - Inserting a delay based on microseconds with busy waiting
+ * @usec: requested delay in microseconds
+ *
+ * When delaying in an atomic context, ndelay(), udelay() and mdelay() are the
+ * only valid variants of delaying/sleeping to use.
+ *
+ * When inserting delays in non-atomic context which are shorter than the time
+ * which is required to queue e.g. an hrtimer and to then enter the scheduler,
+ * it is also valuable to use udelay(). But it is not simple to specify a
+ * generic threshold for this which will fit for all systems.
An approximation
+ * is a threshold for all delays up to 10 microseconds.
+ *
+ * When having a delay which is larger than the architecture-specific
+ * %MAX_UDELAY_MS value, please make sure mdelay() is used. Otherwise there is
+ * a risk of overflow.
+ *
+ * Please note that ndelay(), udelay() and mdelay() may return early for several
+ * reasons (https://lists.openwall.net/linux-kernel/2011/01/09/56):
+ *
+ * #. computed loops_per_jiffy too low (due to the time taken to execute the
+ *    timer interrupt.)
+ * #. cache behaviour affecting the time it takes to execute the loop function.
+ * #. CPU clock rate changes.
+ */
 #define udelay(n) \
 	({ \
 		if (__builtin_constant_p(n)) { \
 			if ((n) / 20000 >= 1) \
 				__bad_udelay(); \
 			else \
 				__const_udelay((n) * 0x10c7ul); \
 		} else { \
 			__udelay(n); \
 		} \
 	})
 
-/* 0x5 is 2**32 / 1000000000 (rounded up) */
+/**
+ * ndelay - Inserting a delay based on nanoseconds with busy waiting
+ * @nsec: requested delay in nanoseconds
+ *
+ * See udelay() for basic information about ndelay() and its variants.
+ */
 #define ndelay(n) \
 	({ \
 		if (__builtin_constant_p(n)) { \
 			if ((n) / 20000 >= 1) \
 				__bad_ndelay(); \
 			else \
 				__const_udelay((n) * 5ul); \
 		} else { \
 			__ndelay(n); \
 		} \
 	})
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 2bc586aa206849..2de509e4adce07 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -6,17 +6,7 @@
  * Copyright (C) 1993 Linus Torvalds
  *
  * Delay routines, using a pre-computed "loops_per_jiffy" value.
- *
- * Please note that ndelay(), udelay() and mdelay() may return early for
- * several reasons:
- * 1. computed loops_per_jiffy too low (due to the time taken to
- *    execute the timer interrupt.)
- * 2. cache behaviour affecting the time it takes to execute the
- *    loop function.
- * 3. CPU clock rate changes.
- *
- * Please see this thread:
- * https://lists.openwall.net/linux-kernel/2011/01/09/56
+ * Sleep routines using timer list timers or hrtimers.
  */
 
 #include
@@ -35,12 +25,21 @@ extern unsigned long loops_per_jiffy;
  * The 2nd mdelay() definition ensures GCC will optimize away the
  * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G.
  */
-
 #ifndef MAX_UDELAY_MS
 #define MAX_UDELAY_MS	5
 #endif
 
 #ifndef mdelay
+/**
+ * mdelay - Inserting a delay based on milliseconds with busy waiting
+ * @n: requested delay in milliseconds
+ *
+ * See udelay() for basic information about mdelay() and its variants.
+ *
+ * Please double check whether mdelay() is the right way to go, or whether
+ * refactoring the code to be able to use msleep() instead is the better
+ * option.
+ */
 #define mdelay(n) (\
 	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
 	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
@@ -63,16 +62,41 @@ unsigned long msleep_interruptible(unsigned int msecs);
 void usleep_range_state(unsigned long min, unsigned long max,
 			unsigned int state);
 
+/**
+ * usleep_range - Sleep for an approximate time
+ * @min: Minimum time in microseconds to sleep
+ * @max: Maximum time in microseconds to sleep
+ *
+ * For basic information please refer to usleep_range_state().
+ *
+ * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
+ */
 static inline void usleep_range(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
 }
 
+/**
+ * usleep_range_idle - Sleep for an approximate time with idle time accounting
+ * @min: Minimum time in microseconds to sleep
+ * @max: Maximum time in microseconds to sleep
+ *
+ * For basic information please refer to usleep_range_state().
+ *
+ * The sleeping task has the state TASK_IDLE during the sleep to prevent
+ * contribution to the load average.
+ */
 static inline void usleep_range_idle(unsigned long min, unsigned long max)
 {
 	usleep_range_state(min, max, TASK_IDLE);
 }
 
+/**
+ * ssleep - wrapper for seconds around msleep
+ * @seconds: Requested sleep duration in seconds
+ *
+ * Please refer to msleep() for detailed information.
+ */
 static inline void ssleep(unsigned int seconds)
 {
 	msleep(seconds * 1000);
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index 560d17c30aa5d9..f3f246e4c8d1a5 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -281,7 +281,34 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout);
 
 /**
  * msleep - sleep safely even with waitqueue interruptions
- * @msecs: Time in milliseconds to sleep for
+ * @msecs: Requested sleep duration in milliseconds
+ *
+ * msleep() uses jiffy-based timeouts for the sleep duration. Because of the
+ * design of the timer wheel, the maximum additional percentage delay (slack) is
+ * 12.5%. This is only valid for timers which will end up in level 1 or a higher
+ * level of the timer wheel. For an explanation of those 12.5%, please check the
+ * detailed description about the basics of the timer wheel.
+ *
+ * The slack of timers which will end up in level 0 depends on sleep duration
+ * (msecs) and HZ configuration and can be calculated in the following way (with
+ * the timer wheel design restriction that the slack is not less than 12.5%):
+ *
+ * ``slack = MSECS_PER_TICK / msecs``
+ *
+ * When the allowed slack of the callsite is known, the calculation can be
+ * turned around to find the minimal allowed sleep duration to meet the
+ * constraints. For example:
+ *
+ * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
+ *   all sleep durations greater than or equal to 4ms meet the constraints.
+ * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
+ *   all sleep durations greater than or equal to 8ms meet the constraints.
+ * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
+ *   all sleep durations greater than or equal to 16ms meet the constraints.
+ * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
+ *   all sleep durations greater than or equal to 32ms meet the constraints.
+ *
+ * See also the signal-aware variant msleep_interruptible().
  */
 void msleep(unsigned int msecs)
 {
@@ -294,7 +321,15 @@ EXPORT_SYMBOL(msleep);
 
 /**
  * msleep_interruptible - sleep waiting for signals
- * @msecs: Time in milliseconds to sleep for
+ * @msecs: Requested sleep duration in milliseconds
+ *
+ * See msleep() for some basic information.
+ *
+ * The difference between msleep() and msleep_interruptible() is that the sleep
+ * can be interrupted by a signal delivery, in which case it returns early.
+ *
+ * Returns: The remaining time of the sleep duration transformed to msecs (see
+ * schedule_timeout() for details).
  */
 unsigned long msleep_interruptible(unsigned int msecs)
 {
@@ -312,11 +347,17 @@ EXPORT_SYMBOL(msleep_interruptible);
  * @max: Maximum time in usecs to sleep
  * @state: State of the current task that will be while sleeping
  *
+ * usleep_range_state() sleeps at least for the minimum specified time but not
+ * longer than the maximum specified amount of time. The range might reduce
+ * power usage by allowing hrtimers to coalesce an already scheduled interrupt
+ * with this hrtimer. In the worst case, an interrupt is scheduled for the upper
+ * bound.
+ *
+ * The sleeping task is set to the specified state before starting the sleep.
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range() or its variants instead of udelay(). The sleep improves
+ * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
  */
 void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state)
 {

From 19e2d91d8cb1f333adf04731f2788ff6ca06cebd Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:23 +0200
Subject: [PATCH 033/140] delay: Rework udelay and ndelay

udelay() as well as ndelay() are defines and not functions, and they use
constants to be able to transform a sleep time into loops and to prevent
too long udelays/ndelays.

There was a compiler error with non-const 8 bit arguments which was fixed
by commit a87e553fabe8 ("asm-generic: delay.h fix udelay and ndelay for 8
bit args"). When using a function, the non-const 8 bit argument is type
cast and the problem is gone.

Transform udelay() and ndelay() into proper functions, remove the no
longer required and confusing division, add defines for the magic values
and add some explanations as well.

Suggested-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-6-dc8b907cb62f@linutronix.de
---
 include/asm-generic/delay.h | 65 +++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h
index a8cee41cc51b68..76cf237b6e4cb5 100644
--- a/include/asm-generic/delay.h
+++ b/include/asm-generic/delay.h
@@ -2,6 +2,9 @@
 #ifndef __ASM_GENERIC_DELAY_H
 #define __ASM_GENERIC_DELAY_H
 
+#include
+#include
+
 /* Undefined functions to get compile-time errors */
 extern void __bad_udelay(void);
 extern void __bad_ndelay(void);
@@ -12,13 +15,18 @@ extern void __const_udelay(unsigned long xloops);
 extern void __delay(unsigned long loops);
 
 /*
- * Implementation details:
- *
- * * The weird n/20000 thing suppresses a "comparison is always false due to
- *   limited range of data type" warning with non-const 8-bit arguments.
- * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay
- * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay
+ * The microseconds/nanosecond delay multipliers are used to convert a
+ * constant microseconds/nanoseconds value to a value which can be used by the
+ * architecture-specific implementation to transform it into loops.
+ */
+#define UDELAY_CONST_MULT	((unsigned long)DIV_ROUND_UP(1ULL << 32, USEC_PER_SEC))
+#define NDELAY_CONST_MULT	((unsigned long)DIV_ROUND_UP(1ULL << 32, NSEC_PER_SEC))
+
+/*
+ * The maximum constant udelay/ndelay value picked out of thin air to prevent
+ * too long constant udelays/ndelays.
  */
+#define DELAY_CONST_MAX	20000
 
 /**
  * udelay - Inserting a delay based on microseconds with busy waiting
@@ -45,17 +53,17 @@ extern void __delay(unsigned long loops);
 * #. cache behaviour affecting the time it takes to execute the loop function.
 * #. CPU clock rate changes.
 */
-#define udelay(n) \
-	({ \
-		if (__builtin_constant_p(n)) { \
-			if ((n) / 20000 >= 1) \
-				__bad_udelay(); \
-			else \
-				__const_udelay((n) * 0x10c7ul); \
-		} else { \
-			__udelay(n); \
-		} \
-	})
+static __always_inline void udelay(unsigned long usec)
+{
+	if (__builtin_constant_p(usec)) {
+		if (usec >= DELAY_CONST_MAX)
+			__bad_udelay();
+		else
+			__const_udelay(usec * UDELAY_CONST_MULT);
+	} else {
+		__udelay(usec);
+	}
+}
 
 /**
  * ndelay - Inserting a delay based on nanoseconds with busy waiting
@@ -63,16 +71,17 @@ extern void __delay(unsigned long loops);
  *
  * See udelay() for basic information about ndelay() and its variants.
  */
-#define ndelay(n) \
-	({ \
-		if (__builtin_constant_p(n)) { \
-			if ((n) / 20000 >= 1) \
-				__bad_ndelay(); \
-			else \
-				__const_udelay((n) * 5ul); \
-		} else { \
-			__ndelay(n); \
-		} \
-	})
+static __always_inline void ndelay(unsigned long nsec)
+{
+	if (__builtin_constant_p(nsec)) {
+		if (nsec >= DELAY_CONST_MAX)
+			__bad_ndelay();
+		else
+			__const_udelay(nsec * NDELAY_CONST_MULT);
+	} else {
+		__ndelay(nsec);
+	}
+}
+#define ndelay(x) ndelay(x)
 
 #endif /* __ASM_GENERIC_DELAY_H */

From 82e11e47c1880362e05c065bef7dbe28a749555c Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:24 +0200
Subject: [PATCH 034/140] timers: Adjust fsleep() to reflect reality

fsleep() simply implements the recommendations of the outdated
documentation in "Documentation/timers/timers-howto.rst". This should be a
user-friendly interface that always chooses the best timeout function
approach:

- udelay() for very short sleep durations shorter than 10 microseconds
- usleep_range() for sleep durations up to 20 milliseconds
- msleep() for the others

The actual implementation has several problems:

- It does not take into account that HZ resolution also has an impact on
  the granularity of jiffies and on the granularity of the buckets of
  timer wheel levels. This means that the additional slack of the timeout
  has no upper limit.

  When executing fsleep(20000) on a HZ=100 system, the possible additional
  slack will be 50% as the granularity of the buckets in the lowest level
  is 10 milliseconds.

- The upper limit of usleep_range() is twice the requested timeout. When
  no other interrupts occur in this range, the maximum value is used. This
  means that the requested sleep length then has an additional delay of
  100%.

Change the thresholds for the decisions in fsleep() to make sure the
maximum slack which is added to the sleep duration is 25%.

Note: Outdated documentation will be updated in a followup patch.
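Spelled out for common configurations, using the new threshold introduced
below (USLEEP_RANGE_UPPER_BOUND = (TICK_NSEC << 2) / NSEC_PER_USEC):

    HZ=1000: TICK_NSEC =  1000000ns -> upper bound =  4000us
    HZ= 250: TICK_NSEC =  4000000ns -> upper bound = 16000us
    HZ= 100: TICK_NSEC = 10000000ns -> upper bound = 40000us

Below the bound, usleep_range(usecs, usecs + (usecs >> 2)) caps the added
slack at 25% of the request. At or above the bound, the one-tick worst
case granularity of msleep() is at most a quarter of the requested
duration, so the 25% limit holds there as well.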
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-7-dc8b907cb62f@linutronix.de
---
 include/linux/delay.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 2de509e4adce07..89866bab100dbb 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -11,6 +11,7 @@
 
 #include
 #include
+#include
 
 extern unsigned long loops_per_jiffy;
 
@@ -102,15 +103,35 @@ static inline void ssleep(unsigned int seconds)
 	msleep(seconds * 1000);
 }
 
-/* see Documentation/timers/timers-howto.rst for the thresholds */
+static const unsigned int max_slack_shift = 2;
+#define USLEEP_RANGE_UPPER_BOUND	((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC)
+
+/**
+ * fsleep - flexible sleep which autoselects the best mechanism
+ * @usecs: requested sleep duration in microseconds
+ *
+ * fsleep() selects the best mechanism that will provide a maximum of 25% slack
+ * to the requested sleep duration. Therefore it uses:
+ *
+ * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer
+ *   overhead for really short sleep durations.
+ * * usleep_range() for sleep durations which would lead to a slack larger
+ *   than 25% when using msleep(). This depends on the granularity of
+ *   jiffies.
+ * * msleep() for all other sleep durations.
+ *
+ * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with
+ * the granularity of jiffies and the slack might exceed 25% especially for
+ * short sleep durations.
+ */
 static inline void fsleep(unsigned long usecs)
 {
 	if (usecs <= 10)
 		udelay(usecs);
-	else if (usecs <= 20000)
-		usleep_range(usecs, 2 * usecs);
+	else if (usecs < USLEEP_RANGE_UPPER_BOUND)
+		usleep_range(usecs, usecs + (usecs >> max_slack_shift));
 	else
-		msleep(DIV_ROUND_UP(usecs, 1000));
+		msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC));
 }
 
 #endif /* defined(_LINUX_DELAY_H) */

From ef0245582e5bccd8b4c480a58bd4da91ee276397 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:25 +0200
Subject: [PATCH 035/140] mm/damon/core: Use generic upper bound recommendation
 for usleep_range()

The upper bound for usleep_range_idle() was taken from the outdated
documentation. As the recommendation for the upper bound of usleep_range()
depends on the HZ configuration, it is not possible to hard code it.

Use the define "USLEEP_RANGE_UPPER_BOUND" instead.
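With USLEEP_RANGE_UPPER_BOUND evaluating to 4000us at HZ=1000 and 40000us
at HZ=100, a sketch of the effect at this call site (the durations are
examples, not taken from DAMON itself):

    kdamond_usleep(2000);	/* below the bound on both configs:
    				 * hrtimer based usleep_range_idle() */
    kdamond_usleep(10000);	/* at HZ=1000 above the 4000us bound:
    				 * jiffies based schedule_timeout_idle();
    				 * at HZ=100 still below 40000us:
    				 * usleep_range_idle() */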
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: SeongJae Park
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-8-dc8b907cb62f@linutronix.de
---
 mm/damon/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index c725c78b43f019..79efd8089d6ca3 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1892,8 +1892,7 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme)
 
 static void kdamond_usleep(unsigned long usecs)
 {
-	/* See Documentation/timers/timers-howto.rst for the thresholds */
-	if (usecs > 20 * USEC_PER_MSEC)
+	if (usecs >= USLEEP_RANGE_UPPER_BOUND)
 		schedule_timeout_idle(usecs_to_jiffies(usecs));
 	else
 		usleep_range_idle(usecs, usecs + 1);

From 6279abf16a014474fba3de2e28b6ede871141cde Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:26 +0200
Subject: [PATCH 036/140] timers: Add a warning to usleep_range_state() for
 wrong order of arguments

There is a checkpatch warning which triggers when the min and max
arguments of usleep_range_state() are in reverse order. That check only
covers call sites which use constants.

Add the check to the code itself as a WARN_ON_ONCE() to also cover call
sites which do not use constants, and fix up the misuse by resetting the
delta to 0.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-9-dc8b907cb62f@linutronix.de
---
 kernel/time/sleep_timeout.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index f3f246e4c8d1a5..3054e5232d208f 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -364,6 +364,9 @@ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned i
 	ktime_t exp = ktime_add_us(ktime_get(), min);
 	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
 
+	if (WARN_ON_ONCE(max < min))
+		delta = 0;
+
 	for (;;) {
 		__set_current_state(state);
 		/* Do not return before the requested sleep time has elapsed */

From 6534086aa684248f779944a2ac9253d6d637eec6 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:27 +0200
Subject: [PATCH 037/140] checkpatch: Remove links to outdated documentation

checkpatch.pl checks for several things related to sleep and delay
functions, and all of those warnings reference the outdated timer
documentation. The checkpatch kernel documentation references it as well.

Replace the links to the outdated documentation with references to the
function descriptions.

Note: Update of the outdated checkpatch checks is done in a second step.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-10-dc8b907cb62f@linutronix.de
---
 Documentation/dev-tools/checkpatch.rst |  2 --
 scripts/checkpatch.pl                  | 10 +++++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst
index a9fac978a52514..abb3ff6820766e 100644
--- a/Documentation/dev-tools/checkpatch.rst
+++ b/Documentation/dev-tools/checkpatch.rst
@@ -470,8 +470,6 @@ API usage
       usleep_range() should be preferred over udelay(). The proper way of
       using usleep_range() is mentioned in the kernel docs.
-      See: https://www.kernel.org/doc/html/latest/timers/timers-howto.html#delays-information-on-the-various-kernel-delay-sleep-mechanisms
-
 Comments
 --------
 
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4427572b24771d..98790fe5115d82 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6597,11 +6597,11 @@ sub process {
 			# ignore udelay's < 10, however
 			if (! ($delay < 10) ) {
 				CHK("USLEEP_RANGE",
-				    "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.rst\n" . $herecurr);
+				    "usleep_range is preferred over udelay; see function description of usleep_range() and udelay().\n" . $herecurr);
 			}
 			if ($delay > 2000) {
 				WARN("LONG_UDELAY",
-				     "long udelay - prefer mdelay; see arch/arm/include/asm/delay.h\n" . $herecurr);
+				     "long udelay - prefer mdelay; see function description of mdelay().\n" . $herecurr);
 			}
 		}
 
@@ -6609,7 +6609,7 @@ sub process {
 		if ($line =~ /\bmsleep\s*\((\d+)\);/) {
 			if ($1 < 20) {
 				WARN("MSLEEP",
-				     "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst\n" . $herecurr);
+				     "msleep < 20ms can sleep for up to 20ms; see function description of msleep().\n" . $herecurr);
 			}
 		}
 
@@ -7077,11 +7077,11 @@ sub process {
 			my $max = $7;
 			if ($min eq $max) {
 				WARN("USLEEP_RANGE",
-				     "usleep_range should not use min == max args; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n");
+				     "usleep_range should not use min == max args; see function description of usleep_range().\n" . "$here\n$stat\n");
 			} elsif ($min =~ /^\d+$/ && $max =~ /^\d+$/ &&
 				 $min > $max) {
 				WARN("USLEEP_RANGE",
-				     "usleep_range args reversed, use min then max; see Documentation/timers/timers-howto.rst\n" . "$here\n$stat\n");
+				     "usleep_range args reversed, use min then max; see function description of usleep_range().\n" . "$here\n$stat\n");
 			}
 		}

From 89124747f096fc0fe44be0162c7b4fb3271739e8 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:29 +0200
Subject: [PATCH 038/140] iopoll/regmap/phy/snd: Fix comment referencing
 outdated timer documentation

The function descriptions in iopoll.h, regmap.h, phy.h and
sound/soc/sof/ops.h all copied the same outdated documentation about
sleep/delay function limitations, and those comments reference the
generic (and still outdated) timer documentation file.

As proper function descriptions for the delay and sleep functions in use
are now in place, simply update the descriptions to reference them.

While at it, fix the missing colon after "Returns" and move the return
value description to the end of each function description.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Andrew Lunn # for phy.h
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-12-dc8b907cb62f@linutronix.de
---
 include/linux/iopoll.h | 52 +++++++++++++++++++++---------------------
 include/linux/phy.h    |  9 ++++----
 include/linux/regmap.h | 38 +++++++++++++++---------------
 sound/soc/sof/ops.h    |  8 +++----
 4 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h
index 19a7b00baff435..91324c331a4b2b 100644
--- a/include/linux/iopoll.h
+++ b/include/linux/iopoll.h
@@ -19,19 +19,19 @@
  * @op: accessor function (takes @args as its arguments)
  * @val: Variable to read the value into
  * @cond: Break condition (usually involving @val)
- * @sleep_us: Maximum time to sleep between reads in us (0
- *            tight-loops).
Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. */ #define read_poll_timeout(op, val, cond, sleep_us, timeout_us, \ sleep_before_read, args...) \ @@ -64,22 +64,22 @@ * @op: accessor function (takes @args as its arguments) * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @delay_before_read: if it is true, delay @delay_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. - * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock * time, which is rather minimal with a non-zero delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. */ #define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ delay_before_read, args...) \ @@ -119,17 +119,17 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. 
*/ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ read_poll_timeout(op, val, cond, sleep_us, timeout_us, false, addr) @@ -140,16 +140,16 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, false, addr) diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde9c..504766d4b2d5c0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1378,12 +1378,13 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ diff --git a/include/linux/regmap.h b/include/linux/regmap.h index f9ccad32fc5cba..75f162b60ba1d7 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -106,17 +106,17 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. 
*/ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ @@ -133,20 +133,20 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). - * Should be less than ~10us since udelay is used - * (see Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read - * error return value in case of a error read. In the two former cases, - * the last read value at @addr is stored in @val. - * * This is modelled after the readx_poll_timeout_atomic macros in linux/iopoll.h. * * Note: In general regmap cannot be used in atomic context. If you want to use * this macro then first setup your regmap for atomic use (flat or no cache * and MMIO regmap). + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * error return value in case of a error read. In the two former cases, + * the last read value at @addr is stored in @val. */ #define regmap_read_poll_timeout_atomic(map, addr, val, cond, delay_us, timeout_us) \ ({ \ @@ -177,17 +177,17 @@ struct reg_sequence { * @field: Regmap field to read from * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index 2584621c3b2d48..d73644e85b6e5d 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -597,12 +597,12 @@ snd_sof_is_chain_dma_supported(struct snd_sof_dev *sdev, u32 dai_type) * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @addr is stored in @val. 
Must not
 * be called from atomic context if sleep_us or timeout_us are used.
 *

From b7f0eb8c9bc8662ca78082e82856fcb0cf16d7c6 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:30 +0200
Subject: [PATCH 039/140] powerpc/rtas: Use fsleep() to minimize additional
 sleep duration

When commit 38f7b7067dae ("powerpc/rtas: rtas_busy_delay() improvements")
was introduced, the documentation about proper usage of sleep related
functions was already outdated. The commit message references the usage of
a HZ=100 system: when using a 20ms sleep duration on such a system and
therefore using msleep(), the possible additional slack will be +10ms.

When the system is configured with HZ=100, the granularity of a jiffy and
of a bucket of the lowest timer wheel level is 10 milliseconds. To make
sure a timer will not expire early (when queueing of the timer races with
a concurrent update of jiffies), timers are always queued into the next
bucket. This is the reason for the maximal possible slack of 10ms.

fsleep() limits the maximal possible slack to 25% by making the threshold
between usleep_range() and msleep() HZ dependent. As soon as the accuracy
of msleep() is sufficient, the less expensive timer list timer based
sleeping function is used instead of the more expensive hrtimer based
usleep_range() function. udelay() will not be used in this specific use
case as the smallest sleep duration is larger than 1 millisecond.

Use fsleep() directly instead of open coding a heuristic for the best
sleeping mechanism to use.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Acked-by: Michael Ellerman (powerpc)
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-13-dc8b907cb62f@linutronix.de
---
 arch/powerpc/kernel/rtas.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index f7e86e09c49fa1..d31c9799cab283 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1390,21 +1390,14 @@ bool __ref rtas_busy_delay(int status)
 		 */
 		ms = clamp(ms, 1U, 1000U);
 		/*
-		 * The delay hint is an order-of-magnitude suggestion, not
-		 * a minimum. It is fine, possibly even advantageous, for
-		 * us to pause for less time than hinted. For small values,
-		 * use usleep_range() to ensure we don't sleep much longer
-		 * than actually needed.
-		 *
-		 * See Documentation/timers/timers-howto.rst for
-		 * explanation of the threshold used here. In effect we use
-		 * usleep_range() for 9900 and 9901, msleep() for
-		 * 9902-9905.
+		 * The delay hint is an order-of-magnitude suggestion, not a
+		 * minimum. It is fine, possibly even advantageous, for us to
+		 * pause for less time than hinted. To make sure pause time will
+		 * not be way longer than requested independent of HZ
+		 * configuration, use fsleep(). See fsleep() for details of
+		 * used sleeping functions.
 		 */
-		if (ms <= 20)
-			usleep_range(ms * 100, ms * 1000);
-		else
-			msleep(ms);
+		fsleep(ms * 1000);
 		break;
 	case RTAS_BUSY:
 		ret = true;

From d2af954f225db2ccf446a4b174a5281dff171d41 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:31 +0200
Subject: [PATCH 040/140] media: anysee: Fix and remove outdated comment

The anysee driver was converted to the USB v2 interfaces years ago, but
the comments in anysee_ctrl_msg() still reference the old interfaces
where msleep() was used.
The v2 interfaces also changed over the years, and with commit
1162c7b383a6 ("[media] dvb_usb_v2: refactor dvb_usbv2_generic_rw()") the
usage of msleep() was gone anyway.

Remove the FIXME comment and also update the comment before the call to
dvb_usbv2_generic_rw_locked().

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Sebastian Andrzej Siewior
Acked-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-14-dc8b907cb62f@linutronix.de
---
 drivers/media/usb/dvb-usb-v2/anysee.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/media/usb/dvb-usb-v2/anysee.c b/drivers/media/usb/dvb-usb-v2/anysee.c
index 8699846eb4162c..bea12cdc85e8ba 100644
--- a/drivers/media/usb/dvb-usb-v2/anysee.c
+++ b/drivers/media/usb/dvb-usb-v2/anysee.c
@@ -46,24 +46,15 @@ static int anysee_ctrl_msg(struct dvb_usb_device *d,
 
 	dev_dbg(&d->udev->dev, "%s: >>> %*ph\n", __func__, slen, state->buf);
 
-	/* We need receive one message more after dvb_usb_generic_rw due
-	   to weird transaction flow, which is 1 x send + 2 x receive. */
+	/*
+	 * We need receive one message more after dvb_usbv2_generic_rw_locked()
+	 * due to weird transaction flow, which is 1 x send + 2 x receive.
+	 */
 	ret = dvb_usbv2_generic_rw_locked(d, state->buf, sizeof(state->buf),
 					  state->buf, sizeof(state->buf));
 	if (ret)
 		goto error_unlock;
 
-	/* TODO FIXME: dvb_usb_generic_rw() fails rarely with error code -32
-	 * (EPIPE, Broken pipe). Function supports currently msleep() as a
-	 * parameter but I would not like to use it, since according to
-	 * Documentation/timers/timers-howto.rst it should not be used such
-	 * short, under < 20ms, sleeps. Repeating failed message would be
-	 * better choice as not to add unwanted delays...
-	 * Fixing that correctly is one of those or both;
-	 * 1) use repeat if possible
-	 * 2) add suitable delay
-	 */
-
 	/* get answer, retry few times if error returned */
 	for (i = 0; i < 3; i++) {
 		/* receive 2nd answer */

From 1f455f601e2060497f9883991e8d5e79fbc7b047 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Mon, 14 Oct 2024 10:22:32 +0200
Subject: [PATCH 041/140] timers/Documentation: Cleanup delay/sleep
 documentation

The documentation which tries to give advice on how to properly insert
delays or sleeps is outdated. The file name 'timers-howto.rst' is also
misleading, as the file only covers delay and sleep mechanisms and not
how to use timers.

Update the documentation by integrating the important parts from the
related function descriptions and move it all into a self-explanatory
file named "delay_sleep_functions.rst".

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-15-dc8b907cb62f@linutronix.de
---
 .../timers/delay_sleep_functions.rst          | 121 ++++++++++++++++++
 Documentation/timers/index.rst                |   2 +-
 Documentation/timers/timers-howto.rst         | 115 -----------------
 3 files changed, 122 insertions(+), 116 deletions(-)
 create mode 100644 Documentation/timers/delay_sleep_functions.rst
 delete mode 100644 Documentation/timers/timers-howto.rst

diff --git a/Documentation/timers/delay_sleep_functions.rst b/Documentation/timers/delay_sleep_functions.rst
new file mode 100644
index 00000000000000..49d603a3f1138d
--- /dev/null
+++ b/Documentation/timers/delay_sleep_functions.rst
@@ -0,0 +1,121 @@
+..
SPDX-License-Identifier: GPL-2.0 + +Delay and sleep mechanisms +========================== + +This document seeks to answer the common question: "What is the +RightWay (TM) to insert a delay?" + +This question is most often faced by driver writers who have to +deal with hardware delays and who may not be the most intimately +familiar with the inner workings of the Linux Kernel. + +The following table gives a rough overview about the existing function +'families' and their limitations. This overview table does not replace the +reading of the function description before usage! + +.. list-table:: + :widths: 20 20 20 20 20 + :header-rows: 2 + + * - + - `*delay()` + - `usleep_range*()` + - `*sleep()` + - `fsleep()` + * - + - busy-wait loop + - hrtimers based + - timer list timers based + - combines the others + * - Usage in atomic Context + - yes + - no + - no + - no + * - precise on "short intervals" + - yes + - yes + - depends + - yes + * - precise on "long intervals" + - Do not use! + - yes + - max 12.5% slack + - yes + * - interruptible variant + - no + - yes + - yes + - no + +A generic advice for non atomic contexts could be: + +#. Use `fsleep()` whenever unsure (as it combines all the advantages of the + others) +#. Use `*sleep()` whenever possible +#. Use `usleep_range*()` whenever accuracy of `*sleep()` is not sufficient +#. Use `*delay()` for very, very short delays + +Find some more detailed information about the function 'families' in the next +sections. + +`*delay()` family of functions +------------------------------ + +These functions use the jiffy estimation of clock speed and will busy wait for +enough loop cycles to achieve the desired delay. udelay() is the basic +implementation and ndelay() as well as mdelay() are variants. + +These functions are mainly used to add a delay in atomic context. Please make +sure to ask yourself before adding a delay in atomic context: Is this really +required? + +.. kernel-doc:: include/asm-generic/delay.h + :identifiers: udelay ndelay + +.. kernel-doc:: include/linux/delay.h + :identifiers: mdelay + + +`usleep_range*()` and `*sleep()` family of functions +---------------------------------------------------- + +These functions use hrtimers or timer list timers to provide the requested +sleeping duration. In order to decide which function is the right one to use, +take some basic information into account: + +#. hrtimers are more expensive as they are using an rb-tree (instead of hashing) +#. hrtimers are more expensive when the requested sleeping duration is the first + timer which means real hardware has to be programmed +#. timer list timers always provide some sort of slack as they are jiffy based + +The generic advice is repeated here: + +#. Use `fsleep()` whenever unsure (as it combines all the advantages of the + others) +#. Use `*sleep()` whenever possible +#. Use `usleep_range*()` whenever accuracy of `*sleep()` is not sufficient + +First check fsleep() function description and to learn more about accuracy, +please check msleep() function description. + + +`usleep_range*()` +~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/linux/delay.h + :identifiers: usleep_range usleep_range_idle + +.. kernel-doc:: kernel/time/sleep_timeout.c + :identifiers: usleep_range_state + + +`*sleep()` +~~~~~~~~~~ + +.. kernel-doc:: kernel/time/sleep_timeout.c + :identifiers: msleep msleep_interruptible + +.. 
kernel-doc:: include/linux/delay.h + :identifiers: ssleep fsleep diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst index 983f91f8f023e0..4e88116e4dcf73 100644 --- a/Documentation/timers/index.rst +++ b/Documentation/timers/index.rst @@ -12,7 +12,7 @@ Timers hrtimers no_hz timekeeping - timers-howto + delay_sleep_functions .. only:: subproject and html diff --git a/Documentation/timers/timers-howto.rst b/Documentation/timers/timers-howto.rst deleted file mode 100644 index ef7a4652ccc9dc..00000000000000 --- a/Documentation/timers/timers-howto.rst +++ /dev/null @@ -1,115 +0,0 @@ -=================================================================== -delays - Information on the various kernel delay / sleep mechanisms -=================================================================== - -This document seeks to answer the common question: "What is the -RightWay (TM) to insert a delay?" - -This question is most often faced by driver writers who have to -deal with hardware delays and who may not be the most intimately -familiar with the inner workings of the Linux Kernel. - - -Inserting Delays ----------------- - -The first, and most important, question you need to ask is "Is my -code in an atomic context?" This should be followed closely by "Does -it really need to delay in atomic context?" If so... - -ATOMIC CONTEXT: - You must use the `*delay` family of functions. These - functions use the jiffy estimation of clock speed - and will busy wait for enough loop cycles to achieve - the desired delay: - - ndelay(unsigned long nsecs) - udelay(unsigned long usecs) - mdelay(unsigned long msecs) - - udelay is the generally preferred API; ndelay-level - precision may not actually exist on many non-PC devices. - - mdelay is macro wrapper around udelay, to account for - possible overflow when passing large arguments to udelay. - In general, use of mdelay is discouraged and code should - be refactored to allow for the use of msleep. - -NON-ATOMIC CONTEXT: - You should use the `*sleep[_range]` family of functions. - There are a few more options here, while any of them may - work correctly, using the "right" sleep function will - help the scheduler, power management, and just make your - driver better :) - - -- Backed by busy-wait loop: - - udelay(unsigned long usecs) - - -- Backed by hrtimers: - - usleep_range(unsigned long min, unsigned long max) - - -- Backed by jiffies / legacy_timers - - msleep(unsigned long msecs) - msleep_interruptible(unsigned long msecs) - - Unlike the `*delay` family, the underlying mechanism - driving each of these calls varies, thus there are - quirks you should be aware of. - - - SLEEPING FOR "A FEW" USECS ( < ~10us? ): - * Use udelay - - - Why not usleep? - On slower systems, (embedded, OR perhaps a speed- - stepped PC!) the overhead of setting up the hrtimers - for usleep *may* not be worth it. Such an evaluation - will obviously depend on your specific situation, but - it is something to be aware of. - - SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms): - * Use usleep_range - - - Why not msleep for (1ms - 20ms)? - Explained originally here: - https://lore.kernel.org/r/15327.1186166232@lwn.net - - msleep(1~20) may not do what the caller intends, and - will often sleep longer (~20 ms actual sleep for any - value given in the 1~20ms range). In many cases this - is not the desired behavior. - - - Why is there no "usleep" / What is a good range? 
-		Since usleep_range is built on top of hrtimers, the
-		wakeup will be very precise (ish), thus a simple
-		usleep function would likely introduce a large number
-		of undesired interrupts.
-
-		With the introduction of a range, the scheduler is
-		free to coalesce your wakeup with any other wakeup
-		that may have happened for other reasons, or at the
-		worst case, fire an interrupt for your upper bound.
-
-		The larger a range you supply, the greater a chance
-		that you will not trigger an interrupt; this should
-		be balanced with what is an acceptable upper bound on
-		delay / performance for your specific code path. Exact
-		tolerances here are very situation specific, thus it
-		is left to the caller to determine a reasonable range.
-
-	SLEEPING FOR LARGER MSECS ( 10ms+ )
-		* Use msleep or possibly msleep_interruptible
-
-		- What's the difference?
-			msleep sets the current task to TASK_UNINTERRUPTIBLE
-			whereas msleep_interruptible sets the current task to
-			TASK_INTERRUPTIBLE before scheduling the sleep. In
-			short, the difference is whether the sleep can be ended
-			early by a signal. In general, just use msleep unless
-			you know you have a need for the interruptible variant.
-
-	FLEXIBLE SLEEPING (any delay, uninterruptible)
-		* Use fsleep

From 2e529e637cef39057d9cf199a1ecb915d97ffcd9 Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Sun, 13 Oct 2024 22:16:58 +0200
Subject: [PATCH 042/140] posix-timers: Replace call_rcu() by kfree_rcu() for
 simple kmem_cache_free() callback

Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call
kvfree_rcu_barrier() from kmem_cache_destroy()"), it is no longer
necessary to use call_rcu() when the callback only performs
kmem_cache_free(). Use kfree_rcu() directly.

The changes were made using Coccinelle.

Signed-off-by: Julia Lawall
Signed-off-by: Thomas Gleixner
Reviewed-by: Uladzislau Rezki (Sony)
Link: https://lore.kernel.org/all/20241013201704.49576-12-Julia.Lawall@inria.fr
---
 kernel/time/posix-timers.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4576aaed13b23b..fc40dacabe7822 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -413,18 +413,11 @@ static struct k_itimer * alloc_posix_timer(void)
 	return tmr;
 }
 
-static void k_itimer_rcu_free(struct rcu_head *head)
-{
-	struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
-
-	kmem_cache_free(posix_timers_cache, tmr);
-}
-
 static void posix_timer_free(struct k_itimer *tmr)
 {
 	put_pid(tmr->it_pid);
 	sigqueue_free(tmr->sigq);
-	call_rcu(&tmr->rcu, k_itimer_rcu_free);
+	kfree_rcu(tmr, rcu);
 }
 
 static void posix_timer_unhash_and_free(struct k_itimer *tmr)

From 14f1e3b3dfc7fc8b61fcb79f956f05625af6f049 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Oct 2024 10:28:54 +0200
Subject: [PATCH 043/140] timekeeping: Read NTP tick length only once

No point in reading it a second time when the comparison fails.
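The pattern is generic: cache the accessor result in a local instead of
calling it again on the slow path. A minimal userspace sketch
(illustrative only; get_value() is a hypothetical stand-in for
ntp_tick_length()):

#include <stdint.h>
#include <stdio.h>

static uint64_t counter;

/* Hypothetical stand-in: each call may return a different value */
static uint64_t get_value(void)
{
	return ++counter;
}

int main(void)
{
	uint64_t cached = 0;
	uint64_t v = get_value();	/* read exactly once */

	if (cached != v)
		cached = v;		/* reuse the local, no second call */

	printf("cached=%llu\n", (unsigned long long)cached);
	return 0;
}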
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-1-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1427c58e980255..2bc3542f29a2ca 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2161,16 +2161,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { + u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ - if (likely(tk->ntp_tick == ntp_tick_length())) { + if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { - tk->ntp_tick = ntp_tick_length(); + tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } From 886150fb4f19505b8f9d26201d7671b25c233a9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:28:55 +0200 Subject: [PATCH 044/140] timekeeping: Don't stop time readers across hard_pps() update hard_pps() update does not modify anything which might be required by time readers so forcing readers out of the way during the update is a pointless exercise. The interaction with adjtimex() and timekeeper updates which call into the NTP code is properly serialized by timekeeper_lock. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-2-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2bc3542f29a2ca..ff98a0b54b54e8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2746,11 +2746,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - __hardpps(phase_ts, raw_ts); - - write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); From 9fe7d9a984f2309ceb9f53bc89eb4885994e5052 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:28:56 +0200 Subject: [PATCH 045/140] timekeeping: Avoid duplicate leap state update do_adjtimex() invokes tk_update_leap_state() unconditionally even when a previous invocation of timekeeping_update() already did that update. Put it into the else path which is invoked when timekeeping_update() is not called. 
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-3-554456a44a15@linutronix.de
---
 kernel/time/timekeeping.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ff98a0b54b54e8..14aaa44104ebd5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2715,8 +2715,9 @@ int do_adjtimex(struct __kernel_timex *txc)
 		__timekeeping_set_tai_offset(tk, tai);
 		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 		clock_set = true;
+	} else {
+		tk_update_leap_state(tk);
 	}
-	tk_update_leap_state(tk);
 
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

From 1f7226b1e70a0e2ca3b305808cc7f1ae3acbd127 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Oct 2024 10:28:57 +0200
Subject: [PATCH 046/140] timekeeping: Abort clocksource change in case of
 failure

There is no point in going through a full timekeeping update when
acquiring a module reference or enabling the new clocksource fails.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-4-554456a44a15@linutronix.de
---
 kernel/time/timekeeping.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 14aaa44104ebd5..a9550f6a7f12ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1608,33 +1608,29 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
 static int change_clocksource(void *data)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	struct clocksource *new, *old = NULL;
+	struct clocksource *new = data, *old = NULL;
 	unsigned long flags;
-	bool change = false;
-
-	new = (struct clocksource *) data;
 
 	/*
-	 * If the cs is in module, get a module reference. Succeeds
-	 * for built-in code (owner == NULL) as well.
+	 * If the clocksource is in a module, get a module reference.
+	 * Succeeds for built-in code (owner == NULL) as well. Abort if the
+	 * reference can't be acquired.
 	 */
-	if (try_module_get(new->owner)) {
-		if (!new->enable || new->enable(new) == 0)
-			change = true;
-		else
-			module_put(new->owner);
+	if (!try_module_get(new->owner))
+		return 0;
+
+	/* Abort if the device can't be enabled */
+	if (new->enable && new->enable(new) != 0) {
+		module_put(new->owner);
+		return 0;
 	}
 
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&tk_core.seq);
 
 	timekeeping_forward_now(tk);
-
-	if (change) {
-		old = tk->tkr_mono.clock;
-		tk_setup_internals(tk, new);
-	}
-
+	old = tk->tkr_mono.clock;
+	tk_setup_internals(tk, new);
 	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
 
 	write_seqcount_end(&tk_core.seq);
@@ -1643,7 +1639,6 @@ static int change_clocksource(void *data)
 	if (old) {
 		if (old->disable)
 			old->disable(old);
-
 		module_put(old->owner);
 	}

From c2a329566a3d5a638061733f232c40379235931d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Oct 2024 10:28:58 +0200
Subject: [PATCH 047/140] timekeeping: Simplify code in timekeeping_advance()

timekeeping_advance() takes the timekeeper_lock and releases it before
returning. When an early return is required, goto statements are used to
make sure the lock is released properly.
When the code was written the locking guard() was not yet available.

Use the guard() to simplify the code and, while at it, clean up the
ordering of the function variables.

No functional change.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-5-554456a44a15@linutronix.de
---
 kernel/time/timekeeping.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a9550f6a7f12ca..cfb718dec737dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2307,23 +2307,22 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 {
 	struct timekeeper *real_tk = &tk_core.timekeeper;
 	struct timekeeper *tk = &shadow_timekeeper;
-	u64 offset;
-	int shift = 0, maxshift;
 	unsigned int clock_set = 0;
-	unsigned long flags;
+	int shift = 0, maxshift;
+	u64 offset;
 
-	raw_spin_lock_irqsave(&timekeeper_lock, flags);
+	guard(raw_spinlock_irqsave)(&timekeeper_lock);
 
 	/* Make sure we're fully resumed: */
 	if (unlikely(timekeeping_suspended))
-		goto out;
+		return false;
 
 	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
 				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
 
 	/* Check if there's really nothing to do */
 	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
-		goto out;
+		return false;
 
 	/* Do some additional sanity checking */
 	timekeeping_check_update(tk, offset);
@@ -2342,8 +2341,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 	maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
 	shift = min(shift, maxshift);
 	while (offset >= tk->cycle_interval) {
-		offset = logarithmic_accumulation(tk, offset, shift,
-							&clock_set);
+		offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
 		if (offset < tk->cycle_interval << shift)
 			shift--;
 	}

From: Thomas Gleixner
Date: Tue, 15 Oct 2024 12:08:39 +0200
Subject: [PATCH 048/140] timekeeping: Reorder struct timekeeper

struct timekeeper is ordered suboptimally vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeping.c)
is:

  cacheline 0: seqcount, tkr_mono
  cacheline 1: tkr_raw, xtime_sec
  cacheline 2: ktime_sec ... tai_offset, internal variables
  cacheline 3: next_leap_ktime, raw_sec, internal variables
  cacheline 4: internal variables

So any access via ktime_get*(), except for access to
CLOCK_MONOTONIC_RAW, will use either cachelines 0 + 1 or cachelines
0 + 2. Access to CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

  cacheline 0: seqcount, tkr_mono
  cacheline 1: xtime_sec, ktime_sec ... tai_offset
  cacheline 2: tkr_raw, raw_sec
  cacheline 3: internal variables
  cacheline 4: internal variables

That means ktime_get*() will access cachelines 0 + 1 and
CLOCK_MONOTONIC_RAW access will use cachelines 0 + 2.

Update the kernel-doc and fix formatting issues while at it. Also fix a
typo in the struct tk_read_base kernel-doc.
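A hedged userspace sketch of how such a layout can be checked (the
struct below is a simplified stand-in with assumed members, not the real
struct timekeeper):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct tk_like {
	uint64_t tkr_mono[7];	/* hot readout base, fills most of line 0 */
	uint64_t xtime_sec;	/* seconds/offset data follows directly */
	uint64_t ktime_sec;
	int64_t  offs_real;
	int32_t  tai_offset;
	uint64_t tkr_raw[7];	/* CLOCK_MONOTONIC_RAW readout base */
	uint64_t internal[8];	/* cold, update-only state at the end */
};

/* Which 64-byte cacheline a member starts in */
#define CACHELINE(m) (offsetof(struct tk_like, m) / 64)

int main(void)
{
	printf("tkr_mono  starts in line %zu\n", CACHELINE(tkr_mono));
	printf("xtime_sec starts in line %zu\n", CACHELINE(xtime_sec));
	printf("tkr_raw   starts in line %zu\n", CACHELINE(tkr_raw));
	printf("internal  starts in line %zu\n", CACHELINE(internal));
	return 0;
}

Printing the line index of each member makes reorderings like the one
above easy to verify without reading pahole output.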
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de --- include/linux/timekeeper_internal.h | 106 +++++++++++++++++----------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 902c20ef495acb..a3b6380a777720 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -26,7 +26,7 @@ * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used - * for a fast NMI safe accessors. + * for the fast NMI safe accessors. * * @base_real is for the fast NMI safe accessor to allow reading clock * realtime from any context. @@ -44,33 +44,41 @@ struct tk_read_base { /** * struct timekeeper - Structure holding internal timekeeping values. - * @tkr_mono: The readout base structure for CLOCK_MONOTONIC - * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW - * @xtime_sec: Current CLOCK_REALTIME time in seconds - * @ktime_sec: Current CLOCK_MONOTONIC time in seconds - * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset - * @offs_real: Offset clock monotonic -> clock realtime - * @offs_boot: Offset clock monotonic -> clock boottime - * @offs_tai: Offset clock monotonic -> clock tai - * @tai_offset: The current UTC to TAI offset in seconds - * @clock_was_set_seq: The sequence number of clock was set events - * @cs_was_changed_seq: The sequence number of clocksource change events - * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds - * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset - * @cycle_interval: Number of clock cycles in one NTP interval - * @xtime_interval: Number of clock shifted nano seconds in one NTP - * interval. - * @xtime_remainder: Shifted nano seconds left over when rounding - * @cycle_interval - * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. - * @ntp_error: Difference between accumulated time and NTP time in ntp - * shifted nano seconds. - * @ntp_error_shift: Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) + * @tkr_mono: The readout base structure for CLOCK_MONOTONIC + * @xtime_sec: Current CLOCK_REALTIME time in seconds + * @ktime_sec: Current CLOCK_MONOTONIC time in seconds + * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset + * @offs_real: Offset clock monotonic -> clock realtime + * @offs_boot: Offset clock monotonic -> clock boottime + * @offs_tai: Offset clock monotonic -> clock tai + * @tai_offset: The current UTC to TAI offset in seconds + * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events + * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset + * @cycle_interval: Number of clock cycles in one NTP interval + * @xtime_interval: Number of clock shifted nano seconds in one NTP + * interval. 
+ * @xtime_remainder: Shifted nano seconds left over when rounding + * @cycle_interval + * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second + * @ntp_tick: The ntp_tick_length() value currently being + * used. This cached copy ensures we consistently + * apply the tick length for an entire tick, as + * ntp_tick_length may change mid-tick, and we don't + * want to apply that new value to the tick in + * progress. + * @ntp_error: Difference between accumulated time and NTP time in ntp + * shifted nano seconds. + * @ntp_error_shift: Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. + * @ntp_err_mult: Multiplication factor for scaled math conversion + * @skip_second_overflow: Flag used to avoid updating NTP twice with same second + * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -88,10 +96,28 @@ struct tk_read_base { * * @monotonic_to_boottime is a timespec64 representation of @offs_boot to * accelerate the VDSO update for CLOCK_BOOTTIME. + * + * The cacheline ordering of the structure is optimized for in kernel usage of + * the ktime_get() and ktime_get_ts64() family of time accessors. Struct + * timekeeper is prepended in the core timekeeping code with a sequence count, + * which results in the following cacheline layout: + * + * 0: seqcount, tkr_mono + * 1: xtime_sec ... tai_offset + * 2: tkr_raw, raw_sec + * 3,4: Internal variables + * + * Cacheline 0,1 contain the data which is used for accessing + * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the + * data for accessing CLOCK_MONOTONIC_RAW. Cacheline 3,4 are internal + * variables which are only accessed during timekeeper updates once per + * tick. */ struct timekeeper { + /* Cacheline 0 (together with prepended seqcount of timekeeper core): */ struct tk_read_base tkr_mono; - struct tk_read_base tkr_raw; + + /* Cacheline 1: */ u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; @@ -99,31 +125,29 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + + /* Cacheline 2: */ + struct tk_read_base tkr_raw; + u64 raw_sec; + + /* Cachline 3 and 4 (timekeeping internal variables): */ unsigned int clock_was_set_seq; u8 cs_was_changed_seq; - ktime_t next_leap_ktime; - u64 raw_sec; + struct timespec64 monotonic_to_boot; - /* The following members are for timekeeping internal use */ u64 cycle_interval; u64 xtime_interval; s64 xtime_remainder; u64 raw_interval; - /* The ntp_tick_length() value currently being used. - * This cached copy ensures we consistently apply the tick - * length for an entire tick, as ntp_tick_length may change - * mid-tick, and we don't want to apply that new value to - * the tick in progress. - */ + + ktime_t next_leap_ktime; u64 ntp_tick; - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. 
 */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*

From 20c7b582e88b8a72832637cd1754e5622aa8a92d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Oct 2024 10:29:00 +0200
Subject: [PATCH 049/140] timekeeping: Move shadow_timekeeper into tk_core

tk_core requires shadow_timekeeper to allow timekeeping_advance() to
update without holding the timekeeper sequence count write locked. This
allows the readers to make progress up to the actual update, where the
shadow timekeeper is copied over to the real timekeeper.

As long as there is only a single timekeeper, having them separate is
fine. But when the timekeeper infrastructure is reused for per PTP clock
timekeepers, shadow_timekeeper needs to be part of tk_core.

No functional change.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-7-554456a44a15@linutronix.de
---
 kernel/time/timekeeping.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cfb718dec737dd..848d2b18f8002a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -50,11 +50,11 @@ DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct {
 	seqcount_raw_spinlock_t	seq;
 	struct timekeeper	timekeeper;
+	struct timekeeper	shadow_timekeeper;
 } tk_core ____cacheline_aligned = {
 	.seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
 };
 
-static struct timekeeper shadow_timekeeper;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
@@ -795,8 +795,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	 * timekeeper structure on the next update with stale data
 	 */
 	if (action & TK_MIRROR)
-		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
-		       sizeof(tk_core.timekeeper));
+		memcpy(&tk_core.shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper));
 }
 
 /**
@@ -2305,8 +2304,8 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
  */
 static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 {
+	struct timekeeper *tk = &tk_core.shadow_timekeeper;
 	struct timekeeper *real_tk = &tk_core.timekeeper;
-	struct timekeeper *tk = &shadow_timekeeper;
 	unsigned int clock_set = 0;
 	int shift = 0, maxshift;
 	u64 offset;

From dbdcf8c4caeca8192daa43429ccf23a1feec126c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Oct 2024 10:29:01 +0200
Subject: [PATCH 050/140] timekeeping: Encapsulate locking/unlocking of
 timekeeper_lock

timekeeper_lock protects updates of timekeeper (tk_core). It is also used
by vdso_update_begin/end() and not only internally by the timekeeper
code.

As long as there is only a single timekeeper, this works fine. But when
the timekeeper infrastructure is reused for per PTP clock timekeepers,
timekeeper_lock needs to be part of tk_core.

Therefore encapsulate locking/unlocking of timekeeper_lock and make the
lock static.
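The shape of the encapsulation, sketched in userspace terms (a pthread
mutex as a stand-in for the raw spinlock; all names here are
illustrative, not the kernel API):

#include <pthread.h>
#include <stdio.h>

/* Private to this file, like the now-static timekeeper_lock */
static pthread_mutex_t core_lock = PTHREAD_MUTEX_INITIALIZER;

/* Exported helpers replace direct access to the lock */
void core_lock_acquire(void)
{
	pthread_mutex_lock(&core_lock);
}

void core_lock_release(void)
{
	pthread_mutex_unlock(&core_lock);
}

int main(void)
{
	core_lock_acquire();
	printf("update done under the private lock\n");
	core_lock_release();
	return 0;
}

Outside users keep working serialization without being able to reach the
lock itself, which is what allows the lock to move into tk_core later.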
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-8-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 15 ++++++++++++++- kernel/time/timekeeping_internal.h | 3 ++- kernel/time/vsyscall.c | 5 ++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 848d2b18f8002a..77e0a0fe77717f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,7 +41,7 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -DEFINE_RAW_SPINLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); /* * The most important data for readout fits into a single 64 byte @@ -114,6 +114,19 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +unsigned long timekeeper_lock_irqsave(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + return flags; +} + +void timekeeper_unlock_irqrestore(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} + /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 0bbae825bc0226..b3dca834f48ca8 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -49,6 +49,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) #endif /* Semi public for serialization of non timekeeper VDSO updates. */ -extern raw_spinlock_t timekeeper_lock; +unsigned long timekeeper_lock_irqsave(void); +void timekeeper_unlock_irqrestore(unsigned long flags); #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9193d6133e5d68..98488b20b594e6 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -151,9 +151,8 @@ void update_vsyscall_tz(void) unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - unsigned long flags; + unsigned long flags = timekeeper_lock_irqsave(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } @@ -172,5 +171,5 @@ void vdso_update_end(unsigned long flags) vdso_write_end(vdata); __arch_sync_vdso_data(vdata); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeper_unlock_irqrestore(flags); } From 8c4799b1845eabbdd820aa340f493ba8919af7a2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:02 +0200 Subject: [PATCH 051/140] timekeeping: Move timekeeper_lock into tk_core timekeeper_lock protects updates to struct tk_core but is not part of struct tk_core. As long as there is only a single timekeeper, this is not a problem. But when the timekeeper infrastructure will be reused for per ptp clock timekeepers, timekeeper_lock needs to be part of tk_core. Move the lock into tk_core, move initialisation of the lock and sequence counter into timekeeping_init() and update all users of timekeeper_lock. As this is touching all lock sites, convert them to use: guard(raw_spinlock_irqsave)(&tk_core.lock); instead of lock/unlock functions whenever possible. 
Suggested-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-9-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 72 ++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 77e0a0fe77717f..5392a66e02b618 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,8 +41,6 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); - /* * The most important data for readout fits into a single 64 byte * cache line. @@ -51,10 +49,8 @@ static struct { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; -} tk_core ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), -}; - + raw_spinlock_t lock; +} tk_core ____cacheline_aligned; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -118,13 +114,13 @@ unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* @@ -216,7 +212,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. - * This isn't necessary to use when holding the timekeeper_lock or doing + * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). 
*/ @@ -708,13 +704,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; int ret; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -727,14 +721,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - - return ret; + guard(raw_spinlock_irqsave)(&tk_core.lock); + return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); @@ -782,7 +770,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } -/* must hold timekeeper_lock */ +/* must hold tk_core.lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { if (action & TK_CLEAR_NTP) { @@ -1491,7 +1479,7 @@ int do_settimeofday64(const struct timespec64 *ts) if (!timespec64_valid_settod(ts)) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -1511,7 +1499,7 @@ int do_settimeofday64(const struct timespec64 *ts) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); @@ -1541,7 +1529,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -1561,7 +1549,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); @@ -1637,7 +1625,7 @@ static int change_clocksource(void *data) return 0; } - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -1646,7 +1634,7 @@ static int change_clocksource(void *data) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); if (old) { if (old->disable) @@ -1801,7 +1789,9 @@ void __init timekeeping_init(void) struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; - unsigned long flags; + + raw_spin_lock_init(&tk_core.lock); + seqcount_raw_spinlock_init(&tk_core.seq, &tk_core.lock); read_persistent_wall_and_boot_offset(&wall_time,
&boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1821,7 +1811,7 @@ void __init timekeeping_init(void) */ wall_to_mono = timespec64_sub(boot_offset, wall_time); - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1838,7 +1828,6 @@ void __init timekeeping_init(void) timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began for persistent clock */ @@ -1919,7 +1908,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); suspend_timing_needed = false; @@ -1931,7 +1920,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); @@ -1955,7 +1944,7 @@ void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); /* @@ -1993,7 +1982,7 @@ void timekeeping_resume(void) timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); @@ -2024,7 +2013,7 @@ int timekeeping_suspend(void) suspend_timing_needed = true; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -2063,7 +2052,7 @@ int timekeeping_suspend(void) timekeeping_update(tk, TK_MIRROR); halt_fast_timekeeper(tk); write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); @@ -2323,7 +2312,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) int shift = 0, maxshift; u64 offset; - guard(raw_spinlock_irqsave)(&timekeeper_lock); + guard(raw_spinlock_irqsave)(&tk_core.lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -2708,7 +2697,7 @@ int do_adjtimex(struct __kernel_timex *txc) ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - raw_spin_lock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&tk_core.lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; @@ -2723,7 +2712,7 @@ int do_adjtimex(struct __kernel_timex *txc) } write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); audit_ntp_log(&ad); @@ -2747,11 +2736,8 @@ int do_adjtimex(struct __kernel_timex *txc) */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); - 
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ From 10f7c178a9dad803e80bc01f47e7b30e12a78957 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:03 +0200 Subject: [PATCH 052/140] timekeeping: Define a struct type for tk_core to make it reusable The anonymous struct type tk_core uses is not reusable. As long as there is only a single timekeeper, this is not a problem. But when the timekeeper infrastructure will be reused for per ptp clock timekeepers, an explicit struct type is required. Define struct tk_data as explicit struct type for tk_core. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-10-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5392a66e02b618..d520c11e912de3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -45,12 +45,14 @@ enum timekeeping_adv_mode { * The most important data for readout fits into a single 64 byte * cache line. */ -static struct { +struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; raw_spinlock_t lock; -} tk_core ____cacheline_aligned; +} ____cacheline_aligned; + +static struct tk_data tk_core; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; From a5f9e4e4ef941048d1ff78cbb1ef95b20ed83802 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:04 +0200 Subject: [PATCH 053/140] timekeeping: Introduce tkd_basic_setup() to make lock and seqcount init reusable Initialization of lock and seqcount needs to be done for every instance of timekeeper struct. To be able to easily reuse it, create a separate function for it. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-11-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d520c11e912de3..cd83deafd3c2f8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1765,6 +1765,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } +static __init void tkd_basic_setup(struct tk_data *tkd) +{ + raw_spin_lock_init(&tkd->lock); + seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); +} + /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * @@ -1792,8 +1798,7 @@ void __init timekeeping_init(void) struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; - raw_spin_lock_init(&tk_core.lock); - seqcount_raw_spinlock_init(&tk_core.seq, &tk_core.lock); + tkd_basic_setup(&tk_core); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && From 1d72d7b5fd53592342db9c9d7d0fde14a883c2c4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:05 +0200 Subject: [PATCH 054/140] timekeeping: Add struct tk_data as argument to timekeeping_update() Updates of the timekeeper are done in two ways: 1. Updating timekeeper and afterwards memcpy()'ing the result into shadow_timekeeper using timekeeping_update().
Used everywhere for updates except in timekeeping_advance(); the sequence counter protected region starts before the first change to the timekeeper is done. 2. Updating shadow_timekeeper and then memcpy()'ing the result into timekeeper. Used only in timekeeping_advance(); the sequence counter protected region is only around timekeeping_update() and the memcpy for the copy from shadow to timekeeper. The second option is fast path optimized. The sequence counter protected region is as short as possible. As this behaviour is mainly documented by commit messages, but not in code, it makes the already non-trivial timekeeping code harder to read. There is no reason why updates to the timekeeper can't use the optimized version everywhere. With this, the code will be cleaner, as code is reused instead of duplicated. To be able to access tk_data which contains all required information, add a pointer to tk_data as an argument to timekeeping_update(). With that, convert the comment about holding the lock into a lockdep assert. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-12-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd83deafd3c2f8..979687aa349d32 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -547,7 +547,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); + * timekeeping_update(tkd, tk, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated.
Since the tk->offs_boot update is a rare event, this @@ -772,9 +772,10 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } -/* must hold tk_core.lock */ -static void timekeeping_update(struct timekeeper *tk, unsigned int action) +static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsigned int action) { + lockdep_assert_held(&tkd->lock); + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); @@ -1498,7 +1499,7 @@ int do_settimeofday64(const struct timespec64 *ts) tk_set_xtime(tk, ts); out: - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1548,7 +1549,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1633,7 +1634,7 @@ static int change_clocksource(void *data) timekeeping_forward_now(tk); old = tk->tkr_mono.clock; tk_setup_internals(tk, new); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1832,7 +1833,7 @@ void __init timekeeping_init(void) tk_set_wall_to_mono(tk, wall_to_mono); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); } @@ -1924,7 +1925,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1987,7 +1988,7 @@ void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -2056,7 +2057,7 @@ int timekeeping_suspend(void) } } - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(&tk_core, tk, TK_MIRROR); halt_fast_timekeeper(tk); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -2374,7 +2375,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) * memcpy under the tk_core.seq against one before we start * updating. */ - timekeeping_update(tk, clock_set); + timekeeping_update(&tk_core, tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); /* The memcpy must come last. Do not put anything here! 
*/ write_seqcount_end(&tk_core.seq); @@ -2712,7 +2713,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); clock_set = true; } else { tk_update_leap_state(tk); From 5aa6c43eca21a929ace6a8e31ab3520ddc50dfa9 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:06 +0200 Subject: [PATCH 055/140] timekeeping: Split out timekeeper update of timekeeping_advance() timekeeping_advance() is the only optimized function which uses shadow_timekeeper for updating the real timekeeper to keep the sequence counter protected region as small as possible. To be able to transform timekeeper updates in other functions to use the same logic, split out the functionality into the separate function timekeeping_update_from_shadow(). While at it, document the reason why the sequence counter must be write held over the call to timekeeping_update() and the copying to the real timekeeper and why using a pointer based update is suboptimal. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-13-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 979687aa349d32..b3f4989173b078 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -799,7 +799,32 @@ static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsig * timekeeper structure on the next update with stale data */ if (action & TK_MIRROR) - memcpy(&tk_core.shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); + memcpy(&tkd->shadow_timekeeper, tk, sizeof(*tk)); +} + +static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) +{ + /* + * Block out readers before invoking timekeeping_update() because + * that updates VDSO and other time related infrastructure. Not + * blocking the readers might let a reader see time going backwards + * when reading from the VDSO after the VDSO update and then + * reading in the kernel from the timekeeper before that got updated. + */ + write_seqcount_begin(&tkd->seq); + + timekeeping_update(tkd, &tkd->shadow_timekeeper, action); + + /* + * Update the real timekeeper. + * + * We could avoid this memcpy() by switching pointers, but that has + * the downside that the reader side no longer benefits from + * the cacheline optimized data layout of the timekeeper and requires + * another indirection. + */ + memcpy(&tkd->timekeeper, &tkd->shadow_timekeeper, sizeof(tkd->shadow_timekeeper)); + write_seqcount_end(&tkd->seq); } /** @@ -2364,21 +2389,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&tk_core.seq); - /* - * Update the real timekeeper. - * - * We could avoid this memcpy by switching pointers, but that - * requires changes to all other timekeeper usage sites as - * well, i.e. move the timekeeper pointer getter into the - * spinlocked/seqcount protected sections. And we trade this - * memcpy under the tk_core.seq against one before we start - * updating.
- */ - timekeeping_update(&tk_core, tk, clock_set); - memcpy(real_tk, tk, sizeof(*tk)); - /* The memcpy must come last. Do not put anything here! */ - write_seqcount_end(&tk_core.seq); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; } From 6b1ef640f4c48663777972ab0953a3eb6355ef85 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:07 +0200 Subject: [PATCH 056/140] timekeeping: Introduce combined timekeeping action flag Instead of explicitly listing all the separate timekeeping action flags, introduce a combined one which covers all actions except the TK_MIRROR action. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-14-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b3f4989173b078..c30b1870fc58d7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -33,6 +33,8 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) +#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) + enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, @@ -1524,7 +1526,7 @@ int do_settimeofday64(const struct timespec64 *ts) tk_set_xtime(tk, ts); out: - timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1574,7 +1576,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1659,7 +1661,7 @@ static int change_clocksource(void *data) timekeeping_forward_now(tk); old = tk->tkr_mono.clock; tk_setup_internals(tk, new); - timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); @@ -1950,7 +1952,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(&tk_core, tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&tk_core.lock, flags); /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); From 97e53792538dd8993172e231f09dadee57f66d69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Oct 2024 10:29:08 +0200 Subject: [PATCH 057/140] timekeeping: Provide timekeeping_restore_shadow() Functions which operate on the real timekeeper, e.g. do_settimeofday(), have error conditions. If they are hit, a full timekeeping update is still required because the already committed operations modified the timekeeper. When these functions are switched to operate on the shadow timekeeper, the full update can be avoided in the error case, but the modified shadow timekeeper has to be restored. Provide a helper function for that.
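As a sketch of the intended usage pattern, anticipating the conversions in the following patches (the setter function and its validity check are stand-ins for the concrete call sites, not code from this patch):

	static int example_settime(const struct timespec64 *ts)
	{
		scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
			struct timekeeper *tks = &tk_core.shadow_timekeeper;

			/* Forwarding already modifies the shadow timekeeper ... */
			timekeeping_forward_now(tks);

			if (!timespec64_valid_settod(ts)) {
				/* ... so an error exit must discard those changes */
				timekeeping_restore_shadow(&tk_core);
				return -EINVAL;
			}

			tk_set_xtime(tks, ts);
			timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
		}
		return 0;
	}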
Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-15-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c30b1870fc58d7..ed0e328fedd023 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -774,6 +774,15 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +/* + * Restore the shadow timekeeper from the real timekeeper. + */ +static void timekeeping_restore_shadow(struct tk_data *tkd) +{ + lockdep_assert_held(&tkd->lock); + memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); +} + static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsigned int action) { lockdep_assert_held(&tkd->lock); @@ -801,7 +810,7 @@ static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsig * timekeeper structure on the next update with stale data */ if (action & TK_MIRROR) - memcpy(&tkd->shadow_timekeeper, tk, sizeof(*tk)); + timekeeping_restore_shadow(tkd); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) From bba9898ef399667b2afe5f79407f1595157c1374 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:09 +0200 Subject: [PATCH 058/140] timekeeping: Rework do_settimeofday64() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. Convert do_settimeofday64() to use this scheme. That allows to use a scoped_guard() for locking the timekeeper lock as the usage of the shadow timekeeper allows a rollback in the error case instead of the full timekeeper update of the original code. 
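For context, the readers which benefit from the shortened write held region use the usual seqcount retry loop. A simplified sketch, modelled on ktime_get_real_seconds() (illustrative only):

	static u64 example_get_real_seconds(void)
	{
		struct timekeeper *tk = &tk_core.timekeeper;
		unsigned int seq;
		u64 seconds;

		do {
			/* Retry if an update races with this read */
			seq = read_seqcount_begin(&tk_core.seq);
			seconds = tk->xtime_sec;
		} while (read_seqcount_retry(&tk_core.seq, seq));

		return seconds;
	}

The shorter the write section, the less often such readers have to spin and retry.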
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-16-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 42 +++++++++++++++------------------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ed0e328fedd023..1b8db11916148b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1510,45 +1510,35 @@ EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); */ int do_settimeofday64(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; - unsigned long flags; - int ret = 0; if (!timespec64_valid_settod(ts)) return -EINVAL; - raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - - xt = tk_xtime(tk); - ts_delta = timespec64_sub(*ts, xt); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { - ret = -EINVAL; - goto out; - } + timekeeping_forward_now(tks); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + xt = tk_xtime(tks); + ts_delta = timespec64_sub(*ts, xt); - tk_set_xtime(tk, ts); -out: - timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); + if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&tk_core.lock, flags); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); + tk_set_xtime(tks, ts); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) { - audit_tk_injoffset(ts_delta); - add_device_randomness(ts, sizeof(*ts)); - } - - return ret; + audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + return 0; } EXPORT_SYMBOL(do_settimeofday64); From 82214756d35f48056fe36aa4d95a22e44a3b2619 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:10 +0200 Subject: [PATCH 059/140] timekeeping: Rework timekeeping_inject_offset() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. Convert timekeeping_inject_offset() to use this scheme. That allows to use a scoped_guard() for locking the timekeeper lock as the usage of the shadow timekeeper allows a rollback in the error case instead of the full timekeeper update of the original code. 
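The invariant behind this update is that CLOCK_REALTIME and the realtime-to-monotonic offset move in opposite directions so that CLOCK_MONOTONIC is unaffected. Schematically, using the helpers from the diff below (a sketch, not new code in the series):

	/* CLOCK_MONOTONIC = CLOCK_REALTIME + wall_to_monotonic must not move */
	static void example_apply_offset(struct timekeeper *tks,
					 const struct timespec64 *ts)
	{
		/* CLOCK_REALTIME += ts */
		tk_xtime_add(tks, ts);
		/* wall_to_monotonic -= ts */
		tk_set_wall_to_mono(tks,
			timespec64_sub(tks->wall_to_monotonic, *ts));
	}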
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-17-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 41 +++++++++++++++------------------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1b8db11916148b..7e865f05793596 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1550,40 +1550,31 @@ EXPORT_SYMBOL(do_settimeofday64); */ static int timekeeping_inject_offset(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 tmp; - int ret = 0; - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), *ts); - if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - ret = -EINVAL; - goto error; - } + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 tmp; - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + timekeeping_forward_now(tks); -error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&tk_core.lock, flags); + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - - return ret; + return 0; } /* From 351619fc99883d22ba1018b5914ae717bfef4221 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:11 +0200 Subject: [PATCH 060/140] timekeeping: Rework change_clocksource() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. Convert change_clocksource() to use this scheme. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-18-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e865f05793596..f77782f557ce9e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1627,9 +1627,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new = data, *old = NULL; - unsigned long flags; /* * If the clocksource is in a module, get a module reference. 
@@ -1645,16 +1643,14 @@ static int change_clocksource(void *data) return 0; } - raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); - timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&tk_core.lock, flags); + timekeeping_forward_now(tks); + old = tks->tkr_mono.clock; + tk_setup_internals(tks, new); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } if (old) { if (old->disable) From 2cab490b41b28a4239baf810ca1bb1c9d6d017ca Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:12 +0200 Subject: [PATCH 061/140] timekeeping: Rework timekeeping_init() to use shadow_timekeeper For timekeeping_init() the sequence count write held time is not relevant and it could keep working on the real timekeeper, but there is no reason to make it different from other timekeeper updates. Convert it to operate on the shadow timekeeper. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-19-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f77782f557ce9e..4e0037d342d84e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1809,7 +1809,7 @@ static bool persistent_clock_exists; void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; - struct timekeeper *tk = &tk_core.timekeeper; + struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core); @@ -1833,22 +1833,20 @@ void __init timekeeping_init(void) wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); - write_seqcount_begin(&tk_core.seq); + ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(tk, clock); + tk_setup_internals(tks, clock); - tk_set_xtime(tk, &wall_time); - tk->raw_sec = 0; + tk_set_xtime(tks, &wall_time); + tks->raw_sec = 0; - tk_set_wall_to_mono(tk, wall_to_mono); + tk_set_wall_to_mono(tks, wall_to_mono); - timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ From 2b473e65dea6be1a60d357f0afe46ecb6bf91501 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:13 +0200 Subject: [PATCH 062/140] timekeeping: Rework timekeeping_inject_sleeptime64() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. Convert timekeeping_inject_sleeptime64() to use this scheme. 
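For reference, the typical caller of timekeeping_inject_sleeptime64() is the RTC resume path, which accounts the time slept via the RTC. A simplified sketch only; old_rtc is assumed to hold the timestamp snapshotted in the suspend path and error handling is reduced to the minimum:

	static struct timespec64 old_rtc;	/* saved at suspend time */

	static void example_rtc_resume(struct rtc_device *rtc)
	{
		struct rtc_time tm;
		struct timespec64 now, sleep_time;

		if (rtc_read_time(rtc, &tm))
			return;

		now = (struct timespec64){ .tv_sec = rtc_tm_to_time64(&tm) };
		sleep_time = timespec64_sub(now, old_rtc);
		if (sleep_time.tv_sec >= 0)
			timekeeping_inject_sleeptime64(&sleep_time);
	}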
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-20-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e0037d342d84e..9552bc76f3863a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1924,22 +1924,14 @@ bool timekeeping_rtc_skipsuspend(void) */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); - - suspend_timing_needed = false; - - timekeeping_forward_now(tk); - - __timekeeping_inject_sleeptime(tk, delta); - - timekeeping_update(&tk_core, tk, TK_UPDATE_ALL | TK_MIRROR); + scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&tk_core.lock, flags); + suspend_timing_needed = false; + timekeeping_forward_now(tks); + __timekeeping_inject_sleeptime(tks, delta); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); From b2350d954dca14dfde95e7512ad521ccab0e4108 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:14 +0200 Subject: [PATCH 063/140] timekeeping: Rework timekeeping_resume() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. While the sequence count held time is not relevant for the resume path as there is no concurrency, there is no reason to have this function different from all the other update sites. Convert timekeeping_resume() to use this scheme and clean up the variable declarations while at it. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-21-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9552bc76f3863a..94f68e7ffc9df2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1943,12 +1943,12 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) */ void timekeeping_resume(void) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr_mono.clock; - unsigned long flags; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; - u64 cycle_now, nsec; bool inject_sleeptime = false; + u64 cycle_now, nsec; + unsigned long flags; read_persistent_clock64(&ts_new); @@ -1956,7 +1956,6 @@ void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and @@ -1970,7 +1969,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source.
The rtc part is handled separately in rtc core code. */ - cycle_now = tk_clock_read(&tk->tkr_mono); + cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); @@ -1982,17 +1981,16 @@ void timekeeping_resume(void) if (inject_sleeptime) { suspend_timing_needed = false; - __timekeeping_inject_sleeptime(tk, &ts_delta); + __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ - tk->tkr_mono.cycle_last = cycle_now; - tk->tkr_raw.cycle_last = cycle_now; + tks->tkr_mono.cycle_last = cycle_now; + tks->tkr_raw.cycle_last = cycle_now; - tk->ntp_error = 0; + tks->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); From d05eae87764ed28a3caf08220d0e2f72dbc0f596 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:15 +0200 Subject: [PATCH 064/140] timekeeping: Rework timekeeping_suspend() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. While the sequence count held time is not relevant for the suspend path as there is no concurrency, there is no reason to have this function different from all the other update sites. Convert timekeeping_suspend() to use this scheme and clean up the variable declarations while at it. As halt_fast_timekeeper() does not need the protection of the sequence counter, it is no problem to move it with this change outside of the sequence counter protected area. But it still needs to be executed while holding the lock. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-22-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 94f68e7ffc9df2..231eaa43a94a6b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2003,11 +2003,11 @@ void timekeeping_resume(void) int timekeeping_suspend(void) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 delta, delta_delta; - static struct timespec64 old_delta; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; struct clocksource *curr_clock; + unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); @@ -2023,8 +2023,7 @@ int timekeeping_suspend(void) suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); - timekeeping_forward_now(tk); + timekeeping_forward_now(tks); timekeeping_suspended = 1; /* @@ -2032,8 +2031,8 @@ int timekeeping_suspend(void) * just read from the current clocksource. Save this to potentially * use in suspend timing.
*/ - curr_clock = tk->tkr_mono.clock; - cycle_now = tk->tkr_mono.cycle_last; + curr_clock = tks->tkr_mono.clock; + cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { @@ -2043,7 +2042,7 @@ int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -2058,9 +2057,8 @@ int timekeeping_suspend(void) } } - timekeeping_update(&tk_core, tk, TK_MIRROR); - halt_fast_timekeeper(tk); - write_seqcount_end(&tk_core.seq); + timekeeping_update_from_shadow(&tk_core, 0); + halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); From ae455cb7b8ad2c1a3947394d448912fa2385f7d2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:16 +0200 Subject: [PATCH 065/140] timekeeping: Rework do_adjtimex() to use shadow_timekeeper Updates of the timekeeper can be done by operating on the shadow timekeeper and afterwards copying the result into the real timekeeper. This has the advantage, that the sequence count write protected region is kept as small as possible. Convert do_adjtimex() to use this scheme and take the opportunity to use a scoped_guard() for locking. That requires to have a separate function for updating the leap state so that the update is protected by the sequence count. This also brings the timekeeper and the shadow timekeeper in sync for this state, which was not the case so far. That's not a correctness problem as the state is only used at the read sides which use the real timekeeper, but it's inconsistent nevertheless. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-23-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 41 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 231eaa43a94a6b..f1179825a9a9c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -741,6 +741,18 @@ static inline void tk_update_leap_state(struct timekeeper *tk) tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } +/* + * Leap state update for both shadow and the real timekeeper + * Separate to spare a full memcpy() of the timekeeper. 
+ */ +static void tk_update_leap_state_all(struct tk_data *tkd) +{ + write_seqcount_begin(&tkd->seq); + tk_update_leap_state(&tkd->shadow_timekeeper); + tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; + write_seqcount_end(&tkd->seq); +} + /* * Update the ktime_t based scalar nsec members of the timekeeper */ @@ -2656,13 +2668,10 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { - struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; struct timespec64 ts; - unsigned long flags; - s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -2673,6 +2682,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; + delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) @@ -2690,23 +2700,22 @@ int do_adjtimex(struct __kernel_timex *txc) ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - raw_spin_lock_irqsave(&tk_core.lock, flags); - write_seqcount_begin(&tk_core.seq); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + s32 orig_tai, tai; - orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + orig_tai = tai = tks->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai, &ad); - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tk, tai); - timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state(tk); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&tk_core.lock, flags); - audit_ntp_log(&ad); /* Update the multiplier immediately if frequency was set directly */ From 0026766dfd699cf217beae5ac92cd153a30b60b0 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:17 +0200 Subject: [PATCH 066/140] timekeeping: Remove TK_MIRROR timekeeping_update() action All call sites using the TK_MIRROR flag in timekeeping_update() are gone. The TK_MIRROR dependent code path is therefore dead code. Remove it along with the TK_MIRROR define.
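After this change every update goes through timekeeping_update_from_shadow(), and the remaining action flags map to the call sites in this series roughly as follows (a summary sketch of the usage above, not new code):

	/* Time was set: reset NTP state and notify about the jump */
	timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);

	/* Resume / TAI offset change: time changed, NTP state stays valid */
	timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);

	/* Suspend: just publish the forwarded timekeeper */
	timekeeping_update_from_shadow(&tk_core, 0);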
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-24-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f1179825a9a9c3..6ca250ab2c20fa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -30,8 +30,7 @@ #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) -#define TK_MIRROR (1 << 1) -#define TK_CLOCK_WAS_SET (1 << 2) +#define TK_CLOCK_WAS_SET (1 << 1) #define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) @@ -816,13 +815,6 @@ static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsig if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; - /* - * The mirroring of the data to the shadow-timekeeper needs - * to happen last here to ensure we don't over-write the - * timekeeper structure on the next update with stale data - */ - if (action & TK_MIRROR) - timekeeping_restore_shadow(tkd); } From 147ba943024e564e89d9ac265d6a07a0d2c03988 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 9 Oct 2024 10:29:18 +0200 Subject: [PATCH 067/140] timekeeping: Merge timekeeping_update_from_shadow() and timekeeping_update() timekeeping_update_from_shadow() is the only call site of timekeeping_update(). Merge those functions. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241009-devel-anna-maria-b4-timers-ptp-timekeeping-v2-25-554456a44a15@linutronix.de --- kernel/time/timekeeping.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6ca250ab2c20fa..17cae886ca8269 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -548,7 +548,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); - * timekeeping_update(tkd, tk, TK_CLEAR_NTP...); + * timekeeping_update_from_shadow(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this @@ -794,10 +794,21 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } -static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsigned int action) +static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { + struct timekeeper *tk = &tk_core.shadow_timekeeper; + lockdep_assert_held(&tkd->lock); + /* + * Block out readers before running the updates below because that + * updates VDSO and other time related infrastructure. Not blocking + * the readers might let a reader see time going backwards when + * reading from the VDSO after the VDSO update and then reading in + * the kernel from the timekeeper before that got updated.
+ */ + write_seqcount_begin(&tkd->seq); + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); @@ -815,20 +826,6 @@ static void timekeeping_update(struct tk_data *tkd, struct timekeeper *tk, unsig if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; -} - -static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) -{ - /* - * Block out readers before invoking timekeeping_update() because - * that updates VDSO and other time related infrastructure. Not - * blocking the readers might let a reader see time going backwards - * when reading from the VDSO after the VDSO update and then - * reading in the kernel from the timekeeper before that got updated. - */ - write_seqcount_begin(&tkd->seq); - - timekeeping_update(tkd, &tkd->shadow_timekeeper, action); /* * Update the real timekeeper. @@ -838,7 +835,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ - memcpy(&tkd->timekeeper, &tkd->shadow_timekeeper, sizeof(tkd->shadow_timekeeper)); + memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } From b05aefc1f5886c8aece650c9c1639c87b976191a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 25 Oct 2024 13:01:40 +0200 Subject: [PATCH 068/140] time: Partially revert cleanup on msecs_to_jiffies() documentation The documentation's intention is to compare msecs_to_jiffies() (first sentence) with __msecs_to_jiffies() (second sentence), which is what the original documentation did. One of the cleanups in commit f3cb80804b82 ("time: Fix various kernel-doc problems") may have thought the paragraph was talking about the latter since that is what it is being documented. Thus revert that part of the change. Fixes: f3cb80804b82 ("time: Fix various kernel-doc problems") Signed-off-by: Miguel Ojeda Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241025110141.157205-1-ojeda@kernel.org --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 5984d4a5639bd5..b1809a1b0827d3 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(ns_to_timespec64); * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * - * __msecs_to_jiffies() checks for the passed in value being a constant + * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at From 92b043fd995a63a57aae29ff85a39b6f30cd440c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 25 Oct 2024 13:01:41 +0200 Subject: [PATCH 069/140] time: Fix references to _msecs_to_jiffies() handling of values The details about the handling of the "normal" values were moved to the _msecs_to_jiffies() helpers in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies"). However, the same commit still mentioned __msecs_to_jiffies() in the added documentation. Thus point to _msecs_to_jiffies() instead. 
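A short usage sketch of the split being documented (illustrative only, not part of the patch):

	static unsigned long example_timeouts(unsigned int user_ms)
	{
		/* Constant argument: __builtin_constant_p() is true, so the
		 * conversion folds at compile time via _msecs_to_jiffies() */
		unsigned long fixed = msecs_to_jiffies(100);

		/* Runtime argument: handled by the out-of-line
		 * __msecs_to_jiffies() implementation */
		unsigned long dynamic = msecs_to_jiffies(user_ms);

		return fixed + dynamic;
	}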
Fixes: ca42aaf0c861 ("time: Refactor msecs_to_jiffies") Signed-off-by: Miguel Ojeda Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241025110141.157205-2-ojeda@kernel.org --- include/linux/jiffies.h | 2 +- kernel/time/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 1220f0fbe5bf9f..5d21dacd62bc7d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -502,7 +502,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. - * for the details see __msecs_to_jiffies() + * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the diff --git a/kernel/time/time.c b/kernel/time/time.c index b1809a1b0827d3..1b69caa87480e7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -556,7 +556,7 @@ EXPORT_SYMBOL(ns_to_timespec64); * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. - * for the details see __msecs_to_jiffies() + * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the From 68f99be287a59d50a9ad231d523f7e578f8bd28a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:00 +0200 Subject: [PATCH 070/140] signal: Confine POSIX_TIMERS properly Move the itimer rearming out of the signal code and consolidate all posix timer related functions in the signal code under one ifdef. 
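The userspace-visible behaviour preserved by the moved rearming code, sketched from the consumer side (plain C, illustrative only; the 100ms period is an arbitrary choice):

	#include <signal.h>
	#include <sys/time.h>
	#include <unistd.h>

	static void on_alrm(int sig) { (void)sig; }

	int main(void)
	{
		/* Periodic ITIMER_REAL: the kernel queues SIGALRM once and
		 * restarts the timer when the signal is dequeued, so a slow
		 * signal consumer cannot accumulate an unbounded backlog. */
		struct itimerval it = {
			.it_interval = { .tv_sec = 0, .tv_usec = 100000 },
			.it_value    = { .tv_sec = 0, .tv_usec = 100000 },
		};

		signal(SIGALRM, on_alrm);
		setitimer(ITIMER_REAL, &it, NULL);
		pause();	/* returns after the first SIGALRM */
		return 0;
	}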
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.314100569@linutronix.de --- include/linux/posix-timers.h | 5 +- kernel/signal.c | 125 ++++++++++++----------------------- kernel/time/itimer.c | 22 +++++- kernel/time/posix-timers.c | 15 ++++- 4 files changed, 81 insertions(+), 86 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 45369171083964..670bf03a56ef43 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -100,6 +100,8 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } +void posixtimer_rearm_itimer(struct task_struct *p); +void posixtimer_rearm(struct kernel_siginfo *info); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -122,6 +124,8 @@ struct cpu_timer { }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } +static inline void posixtimer_rearm_itimer(struct task_struct *p) { } +static inline void posixtimer_rearm(struct kernel_siginfo *info) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -196,5 +200,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); -void posixtimer_rearm(struct kernel_siginfo *info); #endif diff --git a/kernel/signal.c b/kernel/signal.c index 4344860ffcac73..b65cc1853a093c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -478,42 +478,6 @@ void flush_signals(struct task_struct *t) } EXPORT_SYMBOL(flush_signals); -#ifdef CONFIG_POSIX_TIMERS -static void __flush_itimer_signals(struct sigpending *pending) -{ - sigset_t signal, retain; - struct sigqueue *q, *n; - - signal = pending->signal; - sigemptyset(&retain); - - list_for_each_entry_safe(q, n, &pending->list, list) { - int sig = q->info.si_signo; - - if (likely(q->info.si_code != SI_TIMER)) { - sigaddset(&retain, sig); - } else { - sigdelset(&signal, sig); - list_del_init(&q->list); - __sigqueue_free(q); - } - } - - sigorsets(&pending->signal, &signal, &retain); -} - -void flush_itimer_signals(void) -{ - struct task_struct *tsk = current; - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - __flush_itimer_signals(&tsk->pending); - __flush_itimer_signals(&tsk->signal->shared_pending); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); -} -#endif - void ignore_signals(struct task_struct *t) { int i; @@ -636,31 +600,9 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &resched_timer); -#ifdef CONFIG_POSIX_TIMERS - /* - * itimer signal ? - * - * itimers are process shared and we restart periodic - * itimers in the signal delivery path to prevent DoS - * attacks in the high resolution timer case. This is - * compliant with the old way of self-restarting - * itimers, as the SIGALRM is a legacy signal and only - * queued once. Changing the restart behaviour to - * restart the timer in the signal dequeue path is - * reducing the timer noise on heavy loaded !highres - * systems too. 
- */ - if (unlikely(signr == SIGALRM)) { - struct hrtimer *tmr = &tsk->signal->real_timer; - - if (!hrtimer_is_queued(tmr) && - tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); - hrtimer_restart(tmr); - } - } -#endif + + if (unlikely(signr == SIGALRM)) + posixtimer_rearm_itimer(tsk); } recalc_sigpending(); @@ -682,22 +624,12 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } -#ifdef CONFIG_POSIX_TIMERS - if (resched_timer) { - /* - * Release the siglock to ensure proper locking order - * of timer locks outside of siglocks. Note, we leave - * irqs disabled here, since the posix-timers code is - * about to disable them again anyway. - */ - spin_unlock(&tsk->sighand->siglock); - posixtimer_rearm(info); - spin_lock(&tsk->sighand->siglock); - /* Don't expose the si_sys_private value to userspace */ - info->si_sys_private = 0; + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + if (unlikely(resched_timer)) + posixtimer_rearm(info); } -#endif + return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); @@ -1922,15 +1854,43 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); +#ifdef CONFIG_POSIX_TIMERS /* - * These functions support sending signals using preallocated sigqueue - * structures. This is needed "because realtime applications cannot - * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of POSIX Timers - * we allocate the sigqueue structure from the timer_create. If this - * allocation fails we are able to report the failure to the application - * with an EAGAIN error. + * These functions handle POSIX timer signals. POSIX timers use + * preallocated sigqueue structs for sending signals. */ +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + + guard(spinlock_irqsave)(&tsk->sighand->siglock); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); +} + struct sigqueue *sigqueue_alloc(void) { return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC); @@ -2027,6 +1987,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) rcu_read_unlock(); return ret; } +#endif /* CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 00629e658ca196..876d389b2e219a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -151,7 +151,27 @@ COMPAT_SYSCALL_DEFINE2(getitimer, int, which, #endif /* - * The timer is automagically restarted, when interval != 0 + * Invoked from dequeue_signal() when SIGALRM is delivered. + * + * Restart the ITIMER_REAL timer if it is armed as a periodic timer. Doing + * this in the signal delivery path instead of self rearming prevents a DoS + * with small increments in the high resolution timer case and reduces timer + * noise in general.
+ */ +void posixtimer_rearm_itimer(struct task_struct *tsk) +{ + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } +} + +/* + * Interval timers are restarted in the signal delivery path. See + * posixtimer_rearm_itimer(). */ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fc40dacabe7822..d461a32b726082 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -251,7 +251,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) /* * This function is called from the signal delivery code if - * info->si_sys_private is not zero, which indicates that the timer has to + * info::si_sys_private is not zero, which indicates that the timer has to * be rearmed. Restart the timer and update info::si_overrun. */ void posixtimer_rearm(struct kernel_siginfo *info) @@ -259,9 +259,15 @@ void posixtimer_rearm(struct kernel_siginfo *info) struct k_itimer *timr; unsigned long flags; + /* + * Release siglock to ensure proper locking order versus + * timr::it_lock. Keep interrupts disabled. + */ + spin_unlock(&current->sighand->siglock); + timr = lock_timer(info->si_tid, &flags); if (!timr) - return; + goto out; if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); @@ -275,6 +281,11 @@ void posixtimer_rearm(struct kernel_siginfo *info) } unlock_timer(timr, flags); +out: + spin_lock(&current->sighand->siglock); + + /* Don't expose the si_sys_private value to userspace */ + info->si_sys_private = 0; } int posix_timer_queue_signal(struct k_itimer *timr) From a76e1bbe879cf39952ec4b43ed653b0905635f24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:02 +0200 Subject: [PATCH 071/140] signal: Cleanup flush_sigqueue_mask() Mop up the stale return value comment and add a lockdep check instead of commenting on the locking requirement. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241001083835.374933959@linutronix.de --- kernel/signal.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index b65cc1853a093c..f420c430b24a55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -704,17 +704,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) kick_process(t); } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - */ -static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +/* Remove signals in mask from the pending set and queue. */ +static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; + lockdep_assert_held(&p->sighand->siglock); + sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; @@ -848,18 +845,18 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * This is a stop signal. Remove SIGCONT from all queues.
*/ siginitset(&flush, sigmask(SIGCONT)); - flush_sigqueue_mask(&flush, &signal->shared_pending); + flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) - flush_sigqueue_mask(&flush, &t->pending); + flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); - flush_sigqueue_mask(&flush, &signal->shared_pending); + flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { - flush_sigqueue_mask(&flush, &t->pending); + flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; @@ -4114,8 +4111,8 @@ void kernel_sigaction(int sig, __sighandler_t action) sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &current->signal->shared_pending); - flush_sigqueue_mask(&mask, &current->pending); + flush_sigqueue_mask(current, &mask, &current->signal->shared_pending); + flush_sigqueue_mask(current, &mask, &current->pending); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); @@ -4182,9 +4179,9 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &p->signal->shared_pending); + flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) - flush_sigqueue_mask(&mask, &t->pending); + flush_sigqueue_mask(p, &mask, &t->pending); } } From 4febce44cfebcb490b196d5d10ae9f403ca4c956 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:03 +0200 Subject: [PATCH 072/140] posix-timers: Cure si_sys_private race The si_sys_private member of the siginfo which is embedded in the preallocated sigqueue is used by the posix timer code to decide whether a timer must be reprogrammed on signal delivery. The handling of this is racy as a long standing comment in that code documents. It is modified with the timer lock held, but without sighand lock being held. The actual signal delivery code checks for it under sighand lock without holding the timer lock. Hand the new value to send_sigqueue() as an argument and store it with sighand lock held. This is an intermediate change to address this issue. The arguments to this function will be cleaned up in subsequent changes.
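The core of the cure is a publish/consume pair that agrees on a single lock. A simplified standalone model of that pattern (hypothetical names, a pthread mutex standing in for sighand->siglock):

#include <pthread.h>
#include <stdbool.h>

struct model_sigqueue {
	pthread_mutex_t siglock;	/* stands in for sighand->siglock */
	int si_sys_private;		/* written by send, read by dequeue */
	bool queued;
};

/* Models send_sigqueue(): publish the new value under the lock */
static void model_send(struct model_sigqueue *q, int si_private)
{
	pthread_mutex_lock(&q->siglock);
	q->si_sys_private = si_private;	/* now ordered against the reader */
	q->queued = true;
	pthread_mutex_unlock(&q->siglock);
}

/* Models dequeue_signal(): read under the same lock */
static bool model_dequeue(struct model_sigqueue *q, int *si_private)
{
	bool was_queued;

	pthread_mutex_lock(&q->siglock);
	was_queued = q->queued;
	*si_private = q->si_sys_private;
	q->queued = false;
	pthread_mutex_unlock(&q->siglock);
	return was_queued;
}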
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.434338954@linutronix.de --- include/linux/sched/signal.h | 2 +- kernel/signal.c | 10 +++++++++- kernel/time/posix-timers.c | 15 +-------------- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c8ed09ac29ac58..bd9f569231d98a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -340,7 +340,7 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) diff --git a/kernel/signal.c b/kernel/signal.c index f420c430b24a55..1563c83ff224f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1919,7 +1919,7 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) +int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int si_private) { int sig = q->info.si_signo; struct sigpending *pending; @@ -1954,6 +1954,14 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) if (!likely(lock_task_sighand(t, &flags))) goto ret; + /* + * Update @q::info::si_sys_private for posix timer signals with + * sighand locked to prevent a race against dequeue_signal() which + * decides based on si_sys_private whether to invoke + * posixtimer_rearm() or not. + */ + q->info.si_sys_private = si_private; + ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, false)) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d461a32b726082..05af074285fa47 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -299,21 +299,8 @@ int posix_timer_queue_signal(struct k_itimer *timr) if (timr->it_interval) si_private = ++timr->it_requeue_pending; - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->posixtimer_rearm(). - * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls posixtimer_rearm(). - * We re-queue ->sigq and drop ->it_lock(). - * posixtimer_rearm() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; - type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type); + ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private); /* If we failed to send the signal the timer stops. */ return ret > 0; } From c775ea28d4e23f5e58b6953645ef90c1b27a8e83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:04 +0200 Subject: [PATCH 073/140] signal: Allow POSIX timer signals to be dropped In case a timer was reprogrammed or deleted, an already pending signal is obsolete. Right now such signals are kept around and eventually delivered. While POSIX is blurry about this: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified."
- "The disposition of pending signals for the deleted timer is unspecified." it is reasonable in both cases to expect that pending signals are discarded as they have no meaning anymore. Prepare the signal code to allow dropping posix timer signals. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.494416923@linutronix.de --- include/linux/posix-timers.h | 5 +++-- kernel/signal.c | 7 ++++--- kernel/time/posix-timers.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 670bf03a56ef43..4ab49e5c42af8c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -100,8 +100,9 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } + void posixtimer_rearm_itimer(struct task_struct *p); -void posixtimer_rearm(struct kernel_siginfo *info); +bool posixtimer_deliver_signal(struct kernel_siginfo *info); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -125,7 +126,7 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } -static inline void posixtimer_rearm(struct kernel_siginfo *info) { } +static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK diff --git a/kernel/signal.c b/kernel/signal.c index 1563c83ff224f1..df34aa47181e6c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -594,6 +594,7 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) lockdep_assert_held(&tsk->sighand->siglock); +again: *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { @@ -625,9 +626,9 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) current->jobctl |= JOBCTL_STOP_DEQUEUED; } - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - if (unlikely(resched_timer)) - posixtimer_rearm(info); + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(resched_timer)) { + if (!posixtimer_deliver_signal(info)) + goto again; } return signr; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 05af074285fa47..dd0b1dff54d9e9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -254,7 +254,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) * info::si_sys_private is not zero, which indicates that the timer has to * be rearmed. Restart the timer and update info::si_overrun. */ -void posixtimer_rearm(struct kernel_siginfo *info) +bool posixtimer_deliver_signal(struct kernel_siginfo *info) { struct k_itimer *timr; unsigned long flags; @@ -286,6 +286,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) /* Don't expose the si_sys_private value to userspace */ info->si_sys_private = 0; + return true; } int posix_timer_queue_signal(struct k_itimer *timr) From 2860d4d315dc01f001dfd328adaf2ab440c47dd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:06 +0200 Subject: [PATCH 074/140] posix-timers: Drop signal if timer has been deleted or reprogrammed No point in delivering a signal from the past. 
POSIX does not specify the behaviour here: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." In both cases it is reasonable to expect that pending signals are discarded. Especially in the reprogramming case it does not make sense to account for previous overruns or to deliver a signal for a timer which has been disarmed. Drop the signal as that is consistent and understandable behaviour. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.553646280@linutronix.de --- kernel/time/posix-timers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd0b1dff54d9e9..22e1d6bf349b79 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -250,14 +250,14 @@ static void common_hrtimer_rearm(struct k_itimer *timr) } /* - * This function is called from the signal delivery code if - * info::si_sys_private is not zero, which indicates that the timer has to - * be rearmed. Restart the timer and update info::si_overrun. + * This function is called from the signal delivery code. It decides + * whether the signal should be dropped and rearms interval timers. */ bool posixtimer_deliver_signal(struct kernel_siginfo *info) { struct k_itimer *timr; unsigned long flags; + bool ret = false; /* * Release siglock to ensure proper locking order versus @@ -279,6 +279,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } + ret = true; unlock_timer(timr, flags); out: @@ -286,7 +287,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) /* Don't expose the si_sys_private value to userspace */ info->si_sys_private = 0; - return true; + return ret; } int posix_timer_queue_signal(struct k_itimer *timr) From cd1e93aedab7f749760a33e9e094381973b1120e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:07 +0200 Subject: [PATCH 075/140] posix-timers: Rename k_itimer::it_requeue_pending Prepare for using this struct member to do proper reprogramming and deletion accounting so that stale signals can be dropped. No functional change.
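In effect it_requeue_pending is a generation count: signal delivery only acts when the snapshot taken at queueing time still matches the timer's current generation. A self-contained illustration of the idiom (invented names, not kernel code):

#include <stdbool.h>

struct gen_timer {
	unsigned int signal_seq;	/* bumped on queue, rearm, set and delete */
};

struct gen_event {
	unsigned int snapshot;		/* generation the event belongs to */
};

/* Queueing an event snapshots the new generation */
static void gen_queue(struct gen_timer *t, struct gen_event *e)
{
	e->snapshot = ++t->signal_seq;
}

/* Reprogramming or deleting invalidates anything already queued */
static void gen_invalidate(struct gen_timer *t)
{
	t->signal_seq++;
}

/* At delivery a stale event no longer matches and can be dropped */
static bool gen_deliver(const struct gen_timer *t, const struct gen_event *e)
{
	return t->signal_seq == e->snapshot;
}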
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.611997737@linutronix.de --- include/linux/posix-timers.h | 5 ++--- kernel/time/alarmtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 4 ++-- kernel/time/posix-timers.c | 12 ++++++------ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4ab49e5c42af8c..253d106fac2cb8 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -150,8 +150,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_active: Marker that timer is active * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal - * @it_requeue_pending: Indicator that timer waits for being requeued on - * signal delivery + * @it_signal_seq: Sequence count to control signal delivery * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct @@ -172,7 +171,7 @@ struct k_itimer { int it_active; s64 it_overrun; s64 it_overrun_last; - int it_requeue_pending; + unsigned int it_signal_seq; int it_sigev_notify; ktime_t it_interval; struct signal_struct *it_signal; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8bf88864169491..75f84438507093 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -584,7 +584,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, * small intervals cannot starve the system. */ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); - ++ptr->it_requeue_pending; + ++ptr->it_signal_seq; ptr->it_active = 1; result = ALARMTIMER_RESTART; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 6bcee470405926..993243b5be987b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -608,7 +608,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * ticking in case the signal is deliverable next time. 
*/ posix_cpu_timer_rearm(timer); - ++timer->it_requeue_pending; + ++timer->it_signal_seq; } } @@ -745,7 +745,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i * - Timers which expired, but the signal has not yet been * delivered */ - if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none)) + if (iv && ((timer->it_signal_seq & REQUEUE_PENDING) || sigev_none)) expires = bump_cpu_timer(timer, now); else expires = cpu_timer_getexpires(&timer->it.cpu); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 22e1d6bf349b79..fd321fcc3f6cc6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -269,13 +269,13 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) if (!timr) goto out; - if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) { timr->kclock->timer_rearm(timr); timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; - ++timr->it_requeue_pending; + ++timr->it_signal_seq; info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } @@ -299,7 +299,7 @@ int posix_timer_queue_signal(struct k_itimer *timr) timr->it_active = 0; if (timr->it_interval) - si_private = ++timr->it_requeue_pending; + si_private = ++timr->it_signal_seq; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private); @@ -366,7 +366,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; + ++timr->it_signal_seq; timr->it_active = 1; } } @@ -660,7 +660,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + if (iv && (timr->it_signal_seq & REQUEUE_PENDING || sig_none)) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); @@ -861,7 +861,7 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set timer->it_interval = 0; /* Prevent reloading in case there is a signal pending */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING; + timer->it_signal_seq = (timer->it_signal_seq + 2) & ~REQUEUE_PENDING; /* Reset overrun accounting */ timer->it_overrun_last = 0; timer->it_overrun = -1LL; From 1550dde8a537b35dbf066c7f9cfe5f9b360bce0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:09 +0200 Subject: [PATCH 076/140] posix-timers: Add proper state tracking Right now the state tracking is done by two struct members: - it_active: A boolean which tracks armed/disarmed state - it_signal_seq: A sequence counter which is used to invalidate settings and prevent rearming Replace it_active with it_status and properly track the states in one place. This allows reusing it_signal_seq to track reprogramming, disarm and delete operations in order to drop signals which are related to the state prior to those operations.
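The resulting states and transitions fit into a small model (the enum values match the patch below; the transition helpers are invented here for illustration):

enum posix_timer_state {
	POSIX_TIMER_DISARMED,
	POSIX_TIMER_ARMED,
	POSIX_TIMER_REQUEUE_PENDING,
};

struct timer_model {
	enum posix_timer_state it_status;
	long long it_interval;		/* 0 means one-shot */
};

/* Arming, e.g. via timer_settime() with a non-zero value */
static void model_arm(struct timer_model *t)
{
	t->it_status = POSIX_TIMER_ARMED;
}

/* Expiry: interval timers wait for signal delivery to be requeued */
static void model_fire(struct timer_model *t)
{
	t->it_status = t->it_interval ? POSIX_TIMER_REQUEUE_PENDING
				      : POSIX_TIMER_DISARMED;
}

/* Signal delivery re-arms a pending interval timer */
static void model_deliver(struct timer_model *t)
{
	if (t->it_status == POSIX_TIMER_REQUEUE_PENDING)
		t->it_status = POSIX_TIMER_ARMED;
}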
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.670337048@linutronix.de --- include/linux/posix-timers.h | 4 ++-- kernel/time/alarmtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 15 ++++++++------- kernel/time/posix-timers.c | 22 +++++++++++++--------- kernel/time/posix-timers.h | 6 ++++++ 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 253d106fac2cb8..02afbb4da7f783 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -147,7 +147,7 @@ static inline void posix_cputimers_init_work(void) { } * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer - * @it_active: Marker that timer is active + * @it_status: The status of the timer * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery @@ -168,7 +168,7 @@ struct k_itimer { const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; - int it_active; + int it_status; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 75f84438507093..452d8aa2f6e013 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -585,7 +585,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); ++ptr->it_signal_seq; - ptr->it_active = 1; + ptr->it_status = POSIX_TIMER_ARMED; result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 993243b5be987b..12f828d704b1b8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -453,7 +453,6 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; - timer->it_active = 0; if (!cpu_timer_dequeue(ctmr)) return; @@ -494,11 +493,12 @@ static int posix_cpu_timer_del(struct k_itimer *timer) */ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { - if (timer->it.cpu.firing) + if (timer->it.cpu.firing) { ret = TIMER_RETRY; - else + } else { disarm_timer(timer, p); - + timer->it_status = POSIX_TIMER_DISARMED; + } unlock_task_sighand(p, &flags); } @@ -560,7 +560,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - timer->it_active = 1; + timer->it_status = POSIX_TIMER_ARMED; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -586,7 +586,8 @@ static void cpu_timer_fire(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; + if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, @@ -671,7 +672,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; } /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fd321fcc3f6cc6..dd72b8e72697a2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c 
@@ -272,7 +272,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) { timr->kclock->timer_rearm(timr); - timr->it_active = 1; + timr->it_status = POSIX_TIMER_ARMED; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_signal_seq; @@ -292,14 +292,17 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) int posix_timer_queue_signal(struct k_itimer *timr) { + enum posix_timer_state state = POSIX_TIMER_DISARMED; int ret, si_private = 0; enum pid_type type; lockdep_assert_held(&timr->it_lock); - timr->it_active = 0; - if (timr->it_interval) + if (timr->it_interval) { + state = POSIX_TIMER_REQUEUE_PENDING; si_private = ++timr->it_signal_seq; + } + timr->it_status = state; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private); @@ -367,7 +370,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_signal_seq; - timr->it_active = 1; + timr->it_status = POSIX_TIMER_ARMED; } } @@ -640,10 +643,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - } else if (!timr->it_active) { + } else if (timr->it_status == POSIX_TIMER_DISARMED) { /* * SIGEV_NONE oneshot timers are never queued and therefore - * timr->it_active is always false. The check below + * timr->it_status is always DISARMED. The check below * vs. remaining time will handle this case. * * For all other timers there is nothing to update here, so @@ -888,7 +891,7 @@ int common_timer_set(struct k_itimer *timr, int flags, if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; - timr->it_active = 0; + timr->it_status = POSIX_TIMER_DISARMED; posix_timer_set_common(timr, new_setting); /* Keep timer disarmed when it_value is zero */ @@ -901,7 +904,8 @@ int common_timer_set(struct k_itimer *timr, int flags, sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - timr->it_active = !sigev_none; + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; return 0; } @@ -1000,7 +1004,7 @@ int common_timer_del(struct k_itimer *timer) timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; return 0; } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 4784ea65f68537..4d09677e584edd 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,6 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define TIMER_RETRY 1 +enum posix_timer_state { + POSIX_TIMER_DISARMED, + POSIX_TIMER_ARMED, + POSIX_TIMER_REQUEUE_PENDING, +}; + struct k_clock { int (*clock_getres)(const clockid_t which_clock, struct timespec64 *tp); From b35108a51cf7bab58d7eace1267d7965978bcdb8 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:35 +0000 Subject: [PATCH 077/140] jiffies: Define secs_to_jiffies() secs_to_jiffies() is defined in hci_event.c and cannot be reused by other call sites. Hoist it into the core code to allow conversion of the ~1150 usages of msecs_to_jiffies() that either: - use a multiplier value of 1000 or equivalently MSEC_PER_SEC, or - have timeouts that are denominated in seconds (i.e. 
end in 000) It's implemented as a macro to allow usage in static initializers. This will also allow conversion of yet more sites that use (sec * HZ) directly, and improve their readability. Suggested-by: Michael Kelley Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Reviewed-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20241030-open-coded-timeouts-v3-1-9ba123facf88@linux.microsoft.com --- include/linux/jiffies.h | 13 +++++++++++++ net/bluetooth/hci_event.c | 2 -- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5d21dacd62bc7d..ed945f42e064af 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -526,6 +526,19 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) } } +/** + * secs_to_jiffies: - convert seconds to jiffies + * @_secs: time in seconds + * + * Conversion is done by simple multiplication with HZ + * + * secs_to_jiffies() is defined as a macro rather than a static inline + * function so it can be used in static initializers. + * + * Return: jiffies value + */ +#define secs_to_jiffies(_secs) ((_secs) * HZ) + extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1c82dcdf6e8fc7..4bd94d432bcf01 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -42,8 +42,6 @@ #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" -#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000) - /* Handle HCI Event packets */ static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, From 17a8945f369ce2de2532ba8abdb93bb5b2d1c118 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:42 +0100 Subject: [PATCH 078/140] clockevents: Improve clockevents_notify_released() comment When a new clockevent device is added and replaces a previous device, the latter is put into the released list. Then the released list is added back. This may look counter-intuitive but the reason is that a released device might be suitable for other uses. For example a released CPU regular clockevent can be a better replacement for the current broadcast event. Similarly a released broadcast clockevent can be a better replacement for the current regular clockevent of a given CPU. Improve the comments to spell out these subtleties. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-2-frederic@kernel.org --- kernel/time/clockevents.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 78c7bd64d0ddf7..4af27994db93c3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -337,13 +337,21 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, } /* - * Called after a notify add to make devices available which were - * released from the notifier call. + * Called after a clockevent has been added which might + * have replaced a current regular or broadcast device. A + * released normal device might be a suitable replacement + * for the current broadcast device. Similarly a released + * broadcast device might be a suitable replacement for a + * normal device.
*/ static void clockevents_notify_released(void) { struct clock_event_device *dev; + /* + * Keep iterating as long as tick_check_new_device() + * replaces a device. + */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); From 3b1596a21fbf210f5b763fd3c0be280650475b52 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:43 +0100 Subject: [PATCH 079/140] clockevents: Shutdown and unregister current clockevents at CPUHP_AP_TICK_DYING The way the clockevent devices are finally stopped while a CPU is offlining is currently chaotic. In order, the current layout is: 1) tick_sched_timer_dying() stops the tick and the underlying clockevent but only for the oneshot case. The periodic tick and its related clockevent still run. 2) tick_broadcast_offline() detaches and stops the per-cpu oneshot broadcast and appends it to the released list. 3) Some individual clockevent drivers stop the clockevents (a second time if the tick is oneshot) 4) Once the CPU is dead, a control CPU remotely detaches and stops (a 3rd time if oneshot mode) the CPU clockevent and adds it to the released list. 5) The released list containing the broadcast device released in step 2) and the remotely detached clockevent from step 4) are unregistered. These random events can be factorized if the current clockevent is detached and stopped by the dying CPU at the generic layer, that is from the dying CPU: a) Stop the tick b) Stop/detach the underlying per-cpu oneshot broadcast clockevent c) Stop/detach the underlying clockevent d) Release / unregister the clockevents from b) and c) e) Release / unregister the remaining clockevents from the dying CPU. This part could be performed by the dying CPU This way the drivers and the tick layer don't need to care about clockevent operations during cpuhotplug down. This also unifies the tick behaviour on offline CPUs between oneshot and periodic modes, avoiding offline ticks altogether for sanity. Adopt the simplification.
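Condensed, steps b) to e) end up in one place on the dying CPU, roughly as in the sketch below (simplified rendering; the complete version is in the diff that follows):

/* Simplified sketch of the consolidated teardown on the dying CPU */
void tick_offline_cpu_sketch(unsigned int cpu)
{
	raw_spin_lock(&clockevents_lock);
	tick_broadcast_offline(cpu);	/* b) stop/detach the oneshot broadcast */
	tick_shutdown(cpu);		/* c) stop/detach the current clockevent */
	/* d) unregister the devices released by b) and c) */
	/* e) unregister the remaining unused per-cpu devices */
	raw_spin_unlock(&clockevents_lock);
}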
[ tglx: Remove the WARN_ON() in clockevents_register_device() as that is called from an upcoming CPU before the CPU is marked online ] Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-3-frederic@kernel.org --- include/linux/tick.h | 2 -- kernel/cpu.c | 2 -- kernel/time/clockevents.c | 30 +++++++++++------------------- kernel/time/tick-internal.h | 3 +-- 4 files changed, 12 insertions(+), 25 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 72744638c5b0fd..b0c74bfe0600f8 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -20,12 +20,10 @@ extern void __init tick_init(void); extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); -extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } -static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) diff --git a/kernel/cpu.c b/kernel/cpu.c index d293d52a3e00e1..895f3287e3f3fb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1338,8 +1338,6 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); - tick_cleanup_dead_cpu(cpu); - /* * Callbacks must be re-integrated right away to the RCU state machine. * Otherwise an RCU callback could block a further teardown function diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4af27994db93c3..f3e831f62906f1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -618,39 +618,30 @@ void clockevents_resume(void) #ifdef CONFIG_HOTPLUG_CPU -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST /** - * tick_offline_cpu - Take CPU out of the broadcast mechanism + * tick_offline_cpu - Shutdown all clock events related + * to this CPU and take it out of the + * broadcast mechanism. * @cpu: The outgoing CPU * - * Called on the outgoing CPU after it took itself offline. + * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) -{ - raw_spin_lock(&clockevents_lock); - tick_broadcast_offline(cpu); - raw_spin_unlock(&clockevents_lock); -} -# endif - -/** - * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu - * @cpu: The dead CPU - */ -void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; - unsigned long flags; - raw_spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); tick_shutdown(cpu); + /* * Unregister the clock event devices which were - * released from the users in the notify chain. + * released above. 
*/ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); + /* * Now check whether the CPU has left unused per cpu devices */ @@ -662,7 +653,8 @@ void tick_cleanup_dead_cpu(int cpu) list_del(&dev->list); } } - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + + raw_spin_unlock(&clockevents_lock); } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 5f2105e637bdf0..faac36de35b9ef 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -25,6 +25,7 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_offline_cpu(unsigned int cpu); extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); @@ -142,10 +143,8 @@ static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_ #endif /* !(BROADCAST && ONESHOT) */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) -extern void tick_offline_cpu(unsigned int cpu); extern void tick_broadcast_offline(unsigned int cpu); #else -static inline void tick_offline_cpu(unsigned int cpu) { } static inline void tick_broadcast_offline(unsigned int cpu) { } #endif From a6347864d97506a021c469dad35875088edc03fc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:44 +0100 Subject: [PATCH 080/140] tick: Remove now unneeded low-res tick stop on CPUHP_AP_TICK_DYING The generic clockevent layer now detaches and stops the underlying clockevent from the dying CPU, unifying the tick behaviour for both periodic and oneshot mode on offline CPUs. There is no more need for the tick layer to care about that. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-4-frederic@kernel.org --- kernel/time/tick-sched.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c70907a..9f90c7333b1dcb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -311,14 +311,6 @@ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) return HRTIMER_RESTART; } -static void tick_sched_timer_cancel(struct tick_sched *ts) -{ - if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) - hrtimer_cancel(&ts->sched_timer); - else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - tick_program_event(KTIME_MAX, 1); -} - #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); @@ -1055,7 +1047,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { - tick_sched_timer_cancel(ts); + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -1604,21 +1599,13 @@ void tick_setup_sched_timer(bool hrtimer) */ void tick_sched_timer_dying(int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - struct clock_event_device *dev = td->evtdev; ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! 
*/ - tick_sched_timer_cancel(ts); - - /* - * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED, - * make sure not to call low-res tick handler. - */ - if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - dev->event_handler = clockevents_handle_noop; + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; From 900053d9eedfc3f731e59a27d24da938907f5407 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:45 +0100 Subject: [PATCH 081/140] ARM: smp_twd: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-5-frederic@kernel.org --- arch/arm/kernel/smp_twd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index 9a14f721a2b064..42a3706e16a6aa 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -93,7 +93,6 @@ static void twd_timer_stop(void) { struct clock_event_device *clk = raw_cpu_ptr(twd_evt); - twd_shutdown(clk); disable_percpu_irq(clk->irq); } From 78b5c2ca5f27534dc04fbbe0b491dd3bd4ec814b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:46 +0100 Subject: [PATCH 082/140] clocksource/drivers/arm_arch_timer: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-6-frederic@kernel.org --- drivers/clocksource/arm_arch_timer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 03733101e23174..2bba81e25aa2d8 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1179,8 +1179,6 @@ static void arch_timer_stop(struct clock_event_device *clk) disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]); if (arch_timer_has_nonsecure_ppi()) disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); - - clk->set_state_shutdown(clk); } static int arch_timer_dying_cpu(unsigned int cpu) From 15b810e0496eba62ca5a70d1545d1e4757c0a1ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:47 +0100 Subject: [PATCH 083/140] clocksource/drivers/arm_global_timer: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-7-frederic@kernel.org --- drivers/clocksource/arm_global_timer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index a05cfaab5f843e..2d86bbc2764a04 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -195,7 +195,6 @@ static int gt_dying_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(gt_evt); - gt_clockevent_shutdown(clk); disable_percpu_irq(clk->irq); return 0; } From ba23b6c7f97428dc5dd1898edbae397f1a524b13 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:48 +0100 Subject: [PATCH 084/140] clocksource/drivers/exynos_mct: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-8-frederic@kernel.org --- drivers/clocksource/exynos_mct.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index ef8cb1b71be4a8..e6a02e351d771b 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -496,7 +496,6 @@ static int exynos4_mct_dying_cpu(unsigned int cpu) per_cpu_ptr(&percpu_mct_tick, cpu); struct clock_event_device *evt = &mevt->evt; - evt->set_state_shutdown(evt); if (mct_int_type == MCT_INT_SPI) { if (evt->irq != -1) disable_irq_nosync(evt->irq); From 30f8c70a85bcb756b9247c27fff5f0fabf6d5c6e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:49 +0100 Subject: [PATCH 085/140] clocksource/drivers/armada-370-xp: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-9-frederic@kernel.org --- drivers/clocksource/timer-armada-370-xp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-armada-370-xp.c b/drivers/clocksource/timer-armada-370-xp.c index 6ec565d6939ad4..54284c1c065148 100644 --- a/drivers/clocksource/timer-armada-370-xp.c +++ b/drivers/clocksource/timer-armada-370-xp.c @@ -201,7 +201,6 @@ static int armada_370_xp_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *evt = per_cpu_ptr(armada_370_xp_evt, cpu); - evt->set_state_shutdown(evt); disable_percpu_irq(evt->irq); return 0; } From cd165ce8314f8b91b171c1f0d4cf144c0f88f757 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:50 +0100 Subject: [PATCH 086/140] clocksource/drivers/qcom: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. 
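These driver-side removals all share one shape: the CPUHP dying callback keeps its IRQ teardown and drops the clockevent shutdown, which the core now performs at CPUHP_AP_TICK_DYING. A generic sketch of that shape (hypothetical driver; demo_evt is an assumed per-cpu event pointer):

static struct clock_event_device __percpu *demo_evt;

/* Hypothetical driver callback after this series */
static int demo_timer_dying_cpu(unsigned int cpu)
{
	struct clock_event_device *evt = per_cpu_ptr(demo_evt, cpu);

	/* No evt->set_state_shutdown(evt) here anymore; the core does it */
	disable_percpu_irq(evt->irq);
	return 0;
}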
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-10-frederic@kernel.org --- drivers/clocksource/timer-qcom.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-qcom.c b/drivers/clocksource/timer-qcom.c index eac4c95c6127f2..ddb1debe6a6b76 100644 --- a/drivers/clocksource/timer-qcom.c +++ b/drivers/clocksource/timer-qcom.c @@ -130,7 +130,6 @@ static int msm_local_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *evt = per_cpu_ptr(msm_evt, cpu); - evt->set_state_shutdown(evt); disable_percpu_irq(evt->irq); return 0; } From bf9a001fb8e46a23c43d4964523963e717d9e972 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:51 +0100 Subject: [PATCH 087/140] clocksource/drivers/timer-tegra: Remove clockevents shutdown call on offlining The clockevents core already detached and unregistered it at this stage. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-11-frederic@kernel.org --- drivers/clocksource/timer-tegra.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/clocksource/timer-tegra.c b/drivers/clocksource/timer-tegra.c index e9635c25eef4df..35b6ce9deffa6d 100644 --- a/drivers/clocksource/timer-tegra.c +++ b/drivers/clocksource/timer-tegra.c @@ -158,7 +158,6 @@ static int tegra_timer_stop(unsigned int cpu) { struct timer_of *to = per_cpu_ptr(&tegra_to, cpu); - to->clkevt.set_state_shutdown(&to->clkevt); disable_irq_nosync(to->clkevt.irq); return 0; From 1d4199cbbe95efaba51304cfd844bd0ccd224e61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Oct 2024 08:53:51 +0100 Subject: [PATCH 088/140] timers: Add missing READ_ONCE() in __run_timer_base() __run_timer_base() checks base::next_expiry without holding base::lock. That can race with a remote CPU updating next_expiry under the lock. This is an intentional and harmless data race, but lacks a READ_ONCE(), so KCSAN complains about this. Add the missing READ_ONCE(). All other places are covered already. Fixes: 79f8b28e85f8 ("timers: Annotate possible non critical data race of next_expiry") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/87a5emyqk0.ffs@tglx Closes: https://lore.kernel.org/oe-lkp/202410301205.ef8e9743-lkp@intel.com --- kernel/time/timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 02355b275bab25..a283e524835dfb 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2421,7 +2421,8 @@ static inline void __run_timers(struct timer_base *base) static void __run_timer_base(struct timer_base *base) { - if (time_before(jiffies, base->next_expiry)) + /* Can race against a remote CPU updating next_expiry under the lock */ + if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:07 +0100 Subject: [PATCH 089/140] timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which would lead to 64bit multiplication overflows correctly. It's also protected against negative motion of the clocksource unconditionally, which was exclusive to x86 before. timekeeping_advance() handles large offsets already correctly. 
That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases is very close to zero. Remove all of it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de --- arch/riscv/configs/defconfig | 1 - include/linux/timekeeper_internal.h | 16 --- kernel/time/timekeeping.c | 108 +----------------- lib/Kconfig.debug | 13 --- .../selftests/wireguard/qemu/debug.config | 1 - 5 files changed, 3 insertions(+), 136 deletions(-) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 2341393cfac1ae..26c01b9e3434c4 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -301,7 +301,6 @@ CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index a3b6380a777720..e39d4d563b1975 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -76,9 +76,6 @@ struct tk_read_base { * ntp shifted nano seconds. * @ntp_err_mult: Multiplication factor for scaled math conversion * @skip_second_overflow: Flag used to avoid updating NTP twice with same second - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -147,19 +144,6 @@ struct timekeeper { u32 ntp_error_shift; u32 ntp_err_mult; u32 skip_second_overflow; - -#ifdef CONFIG_DEBUG_TIMEKEEPING - long last_warning; - /* - * These simple flag variables are managed - * without locks, which is racy, but they are - * ok since we don't really care about being - * super precise about how many events were - * seen, just that a problem was observed. 
- */ - int underflow_seen; - int overflow_seen; -#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 17cae886ca8269..d115adebc418c7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -226,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles); - -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - struct timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. 
- */ - if (unlikely((~delta & mask) < (mask >> 3))) - tk->underflow_seen = 1; - - /* Check for multiplication overflows */ - if (unlikely(delta > max)) - tk->overflow_seen = 1; - - /* timekeeping_cycles_to_ns() handles both under and overflow */ - return timekeeping_cycles_to_ns(tkr, now); -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - BUG(); -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -421,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) -{ - if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) - return timekeeping_debug_get_ns(tkr); - - return __timekeeping_get_ns(tkr); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -477,7 +378,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += __timekeeping_get_ns(tkr); + now += timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -593,7 +494,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = __timekeeping_get_ns(tkr); + delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -2333,9 +2234,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* Do some additional sanity checking */ - timekeeping_check_update(tk, offset); - /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7315f643817ae1..14977b9fc254ef 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1328,19 +1328,6 @@ config SCHEDSTATS endmenu -config DEBUG_TIMEKEEPING - bool "Enable extra timekeeping sanity checking" - help - This option will enable additional timekeeping sanity checks - which may be helpful when diagnosing issues where timekeeping - problems are suspected. - - This may include checks in the timekeeping hotpaths, so this - option may have a (very small) performance impact to some - workloads. - - If unsure, say N. 
- config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 9d172210e2c63f..139fd9aa8b1218 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -31,7 +31,6 @@ CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_DEBUG_TIMEKEEPING=y CONFIG_DEBUG_PREEMPT=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y From c163e40af9b2331b2c629fd4ec8b703ed4d4ae39 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:08 +0100 Subject: [PATCH 090/140] timekeeping: Always check for negative motion clocksource_delta() has two variants. One with a check for negative motion, which is only selected by x86. This is a historic leftover as this function was previously used in the time getter hot paths. Since 135225a363ae timekeeping_cycles_to_ns() has unconditional protection against this as a by-product of the protection against 64bit math overflow. clocksource_delta() is only used in the clocksource watchdog and in timekeeping_advance(). The extra conditional there is not hurting anyone. Remove the config option and unconditionally prevent negative motion of the readout. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.599430157@linutronix.de --- arch/x86/Kconfig | 1 - kernel/time/Kconfig | 5 ----- kernel/time/timekeeping_internal.h | 7 ------- 3 files changed, 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd8c..53a5eda8219c2e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -145,7 +145,6 @@ config X86 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 - select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG # Word-size accesses may read uninitialized data past the trailing \0 # in strings and cause false KMSAN reports. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8ebb6d5a106bea..b0b97a60aaa6fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool -# Clocksources require validation of the clocksource against the last -# cycle update - x86/TSC misfeature -config CLOCKSOURCE_VALIDATE_LAST_CYCLE - bool - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index b3dca834f48ca8..63e600e943a767 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -30,7 +30,6 @@ static inline void timekeeping_inc_mg_floor_swaps(void) #endif -#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) { u64 ret = (now - last) & mask; @@ -41,12 +40,6 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) */ return ret & ~(mask >> 1) ? 0 : ret; } -#else -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) -{ - return (now - last) & mask; -} -#endif /* Semi public for serialization of non timekeeper VDSO updates. 
  */
 unsigned long timekeeper_lock_irqsave(void);

From 15cbfb92efee5c7f09e531a331e19759dbe0ac3c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:29 +0100
Subject: [PATCH 091/140] posix-cpu-timers: Correctly update timer status in
 posix_cpu_timer_del()

If posix_cpu_timer_del() exits early due to task not found or sighand
invalid, it fails to clear the state of the timer. That's harmless but
inconsistent.

These early exits are accounted as successful deletions. Move the update
of the timer state into the success return path, so all "successful"
deletions are handled.

Reported-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241105064212.974053438@linutronix.de
---
 kernel/time/posix-cpu-timers.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 12f828d704b1b8..5f444e3724649b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -493,20 +493,20 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
 		 */
 		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
 	} else {
-		if (timer->it.cpu.firing) {
+		if (timer->it.cpu.firing)
 			ret = TIMER_RETRY;
-		} else {
+		else
 			disarm_timer(timer, p);
-			timer->it_status = POSIX_TIMER_DISARMED;
-		}
 		unlock_task_sighand(p, &flags);
 	}

out:
 	rcu_read_unlock();
-	if (!ret)
-		put_pid(ctmr->pid);
+	if (!ret) {
+		put_pid(ctmr->pid);
+		timer->it_status = POSIX_TIMER_DISARMED;
+	}
 	return ret;
 }

From 513793bc6ab331b947111e8efaf8fcef33fb83e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:31 +0100
Subject: [PATCH 092/140] posix-timers: Make signal delivery consistent

Signals of timers which are reprogrammed, disarmed or deleted can deliver
signals related to the past. The POSIX spec is blurry about this:

- "The effect of disarming or resetting a timer with pending expiration
  notifications is unspecified."

- "The disposition of pending signals for the deleted timer is
  unspecified."

In both cases it is reasonable to expect that pending signals are
discarded. Especially in the reprogramming case it does not make sense to
account for previous overruns or to deliver a signal for a timer which has
been disarmed. This makes the behaviour consistent and understandable.

Remove the si_sys_private check from the signal delivery code and invoke
posixtimer_deliver_signal() unconditionally for posix timer related
signals.

Change posixtimer_deliver_signal() so it controls the actual signal
delivery via the return value. It now instructs the signal code to drop
the signal when:

1) The timer no longer exists in the hash table

2) The timer signal_seq value is not the same as the si_sys_private value
   which was set when the signal was queued.

This is also a preparatory change to embed the sigqueue into the k_itimer
structure, which in turn allows the si_sys_private magic to be removed.
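For illustration only (a hedged userspace sketch, not part of this patch;
error handling omitted, may need -lrt on older glibc): the user visible
effect is that a signal still pending when the timer is reprogrammed or
disarmed is expected to be discarded instead of being delivered late:

	#include <signal.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	int main(void)
	{
		struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGALRM };
		struct itimerspec its = { .it_value.tv_nsec = 1000000 };	/* 1ms one shot */
		struct timespec wait = { .tv_sec = 1 };
		sigset_t set;
		timer_t tid;

		sigemptyset(&set);
		sigaddset(&set, SIGALRM);
		sigprocmask(SIG_BLOCK, &set, NULL);	/* keep the signal pending */

		timer_create(CLOCK_MONOTONIC, &sev, &tid);
		timer_settime(tid, 0, &its, NULL);
		usleep(10000);				/* timer expired, signal queued */

		its.it_value.tv_nsec = 0;		/* disarm the timer */
		timer_settime(tid, 0, &its, NULL);

		/* The stale signal is expected to be dropped: this times out */
		printf("sigtimedwait() = %d\n", sigtimedwait(&set, NULL, &wait));
		return 0;
	}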
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064213.040348644@linutronix.de --- include/linux/posix-timers.h | 2 -- kernel/signal.c | 6 ++---- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 28 ++++++++++++++++------------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 02afbb4da7f783..8c6d97412526b0 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -137,8 +137,6 @@ static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif -#define REQUEUE_PENDING 1 - /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List head for binding the timer to signals->posix_timers diff --git a/kernel/signal.c b/kernel/signal.c index df34aa47181e6c..68e6bc70ccf2cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -550,10 +550,8 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i list_del_init(&first->list); copy_siginfo(info, &first->info); - *resched_timer = - (first->flags & SIGQUEUE_PREALLOC) && - (info->si_code == SI_TIMER) && - (info->si_sys_private); + *resched_timer = (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER); __sigqueue_free(first); } else { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5f444e3724649b..4305c003c8d4b3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -746,7 +746,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i * - Timers which expired, but the signal has not yet been * delivered */ - if (iv && ((timer->it_signal_seq & REQUEUE_PENDING) || sigev_none)) + if (iv && timer->it_status != POSIX_TIMER_ARMED) expires = bump_cpu_timer(timer, now); else expires = cpu_timer_getexpires(&timer->it.cpu); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd72b8e72697a2..b380e25d49474d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -269,7 +269,10 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) if (!timr) goto out; - if (timr->it_interval && timr->it_signal_seq == info->si_sys_private) { + if (timr->it_signal_seq != info->si_sys_private) + goto out_unlock; + + if (timr->it_interval && !WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) { timr->kclock->timer_rearm(timr); timr->it_status = POSIX_TIMER_ARMED; @@ -281,6 +284,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) } ret = true; +out_unlock: unlock_timer(timr, flags); out: spin_lock(¤t->sighand->siglock); @@ -293,19 +297,18 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) int posix_timer_queue_signal(struct k_itimer *timr) { enum posix_timer_state state = POSIX_TIMER_DISARMED; - int ret, si_private = 0; enum pid_type type; + int ret; lockdep_assert_held(&timr->it_lock); - if (timr->it_interval) { + if (timr->it_interval) state = POSIX_TIMER_REQUEUE_PENDING; - si_private = ++timr->it_signal_seq; - } + timr->it_status = state; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type, si_private); + ret = send_sigqueue(timr->sigq, timr->it_pid, type, timr->it_signal_seq); /* If we failed to send the signal the timer stops. 
*/ return ret > 0; } @@ -663,7 +666,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ - if (iv && (timr->it_signal_seq & REQUEUE_PENDING || sig_none)) + if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); @@ -863,8 +866,6 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set else timer->it_interval = 0; - /* Prevent reloading in case there is a signal pending */ - timer->it_signal_seq = (timer->it_signal_seq + 2) & ~REQUEUE_PENDING; /* Reset overrun accounting */ timer->it_overrun_last = 0; timer->it_overrun = -1LL; @@ -882,8 +883,6 @@ int common_timer_set(struct k_itimer *timr, int flags, if (old_setting) common_timer_get(timr, old_setting); - /* Prevent rearming by clearing the interval */ - timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. @@ -933,6 +932,9 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -1001,7 +1003,6 @@ int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; - timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_status = POSIX_TIMER_DISARMED; @@ -1012,6 +1013,9 @@ static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; + /* Prevent signal delivery and rearming. */ + timer->it_signal_seq++; + if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); From b06b0345fff3678517acd0f1837d52477ba30944 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:32 +0100 Subject: [PATCH 093/140] posix-timers: Make signal overrun accounting sensible The handling of the timer overrun in the signal code is inconsistent as it takes previous overruns into account. This is just wrong as after the reprogramming of a timer the overrun count starts over from a clean state, i.e. 0. Don't touch info::si_overrun in send_sigqueue() and only store the overrun value at signal delivery time, which is computed from the timer itself relative to the expiry time. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.106738193@linutronix.de --- kernel/signal.c | 6 ------ kernel/time/posix-timers.c | 11 ++++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 68e6bc70ccf2cf..ba7159b25d514f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1968,15 +1968,9 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int s ret = 0; if (unlikely(!list_empty(&q->list))) { - /* - * If an SI_TIMER entry is already queue just increment - * the overrun count. - */ - q->info.si_overrun++; result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } - q->info.si_overrun = 0; signalfd_notify(t, sig); pending = (type != PIDTYPE_PID) ? 
&t->signal->shared_pending : &t->pending;

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b380e25d49474d..66ed49efc02f44 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -233,11 +233,12 @@ __initcall(init_posix_timers);
  * The siginfo si_overrun field and the return value of timer_getoverrun(2)
  * are of type int. Clamp the overrun value to INT_MAX
  */
-static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
+static inline int timer_overrun_to_int(struct k_itimer *timr)
 {
-	s64 sum = timr->it_overrun_last + (s64)baseval;
+	if (timr->it_overrun_last > (s64)INT_MAX)
+		return INT_MAX;

-	return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
+	return (int)timr->it_overrun_last;
 }

 static void common_hrtimer_rearm(struct k_itimer *timr)
@@ -280,7 +281,7 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info)
 		timr->it_overrun = -1LL;
 		++timr->it_signal_seq;

-		info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
+		info->si_overrun = timer_overrun_to_int(timr);
 	}
 	ret = true;

@@ -774,7 +775,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 	if (!timr)
 		return -EINVAL;

-	overrun = timer_overrun_to_int(timr, 0);
+	overrun = timer_overrun_to_int(timr);
 	unlock_timer(timr, flags);

 	return overrun;

From bf635681c906ad056d1fda325de8d1c12c9f8201 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:33 +0100
Subject: [PATCH 094/140] posix-cpu-timers: Cleanup the firing logic

The firing flag of a posix CPU timer is tristate:

 0: when the timer is not about to deliver a signal

 1: when the timer has expired, but the signal has not been delivered yet

-1: when the timer was queued for signal delivery and a rearm operation
    raced against it and suppressed the signal delivery.

This is a pointless exercise as this can be simply expressed with a
boolean. The signal is delivered only when the flag is set.

This makes delete and rearm consistent with the rest of the posix timers.

Convert firing to bool, fix up the usage sites accordingly and add
comments explaining why the timer cannot be dequeued right away.

Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/all/20241105064213.172848618@linutronix.de
---
 include/linux/posix-timers.h   |  2 +-
 kernel/time/posix-cpu-timers.c | 34 ++++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8c6d97412526b0..b1de21731a082f 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -49,7 +49,7 @@ struct cpu_timer {
 	struct timerqueue_head	*head;
 	struct pid		*pid;
 	struct list_head	elist;
-	int			firing;
+	bool			firing;
 	struct task_struct __rcu *handling;
 };

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4305c003c8d4b3..a282a3c0060543 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -493,10 +493,18 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
 		 */
 		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
 	} else {
-		if (timer->it.cpu.firing)
+		if (timer->it.cpu.firing) {
+			/*
+			 * Prevent signal delivery. The timer cannot be dequeued
+			 * because it is on the firing list which is not protected
+			 * by sighand->lock. The delivery path is waiting for
+			 * the timer lock. So go back, unlock and retry.
+ */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; - else + } else { disarm_timer(timer, p); + } unlock_task_sighand(p, &flags); } @@ -668,7 +676,13 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, old_expires = cpu_timer_getexpires(ctmr); if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; + /* + * Prevent signal delivery. The timer cannot be dequeued + * because it is on the firing list which is not protected + * by sighand->lock. The delivery path is waiting for + * the timer lock. So go back, unlock and retry. + */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); @@ -809,7 +823,7 @@ static u64 collect_timerqueue(struct timerqueue_head *head, if (++i == MAX_COLLECTED || now < expires) return expires; - ctmr->firing = 1; + ctmr->firing = true; /* See posix_cpu_timer_wait_running() */ rcu_assign_pointer(ctmr->handling, current); cpu_timer_dequeue(ctmr); @@ -1364,7 +1378,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { - int cpu_firing; + bool cpu_firing; /* * spin_lock() is sufficient here even independent of the @@ -1376,13 +1390,13 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; + timer->it.cpu.firing = false; /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. + * If the firing flag is cleared then this raced with a + * timer rearm/delete operation. So don't generate an + * event. */ - if (likely(cpu_firing >= 0)) + if (likely(cpu_firing)) cpu_timer_fire(timer); /* See posix_cpu_timer_wait_running() */ rcu_assign_pointer(timer->it.cpu.handling, NULL); From 4cf7bf2a2f1a8ace4a49a1138c8123fdb5990093 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:35 +0100 Subject: [PATCH 095/140] posix-cpu-timers: Use dedicated flag for CPU timer nanosleep POSIX CPU timer nanosleep creates a k_itimer on stack and uses the sigq pointer to detect the nanosleep case in the expiry function. Prepare for embedding sigqueue into struct k_itimer by using a dedicated flag for nanosleep. 
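For illustration only (a hedged userspace sketch, not part of this patch;
error handling omitted, build with -lpthread): clock_nanosleep() on a CPU
clock is the caller which ends up with the on-stack k_itimer that the new
nanosleep flag marks. A spinning thread makes the process CPU clock
advance so the sleep can actually expire:

	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>

	static void *burn(void *arg)
	{
		for (;;)	/* consume CPU so CLOCK_PROCESS_CPUTIME_ID advances */
			;
		return NULL;
	}

	int main(void)
	{
		struct timespec ts = { .tv_nsec = 100000000 };	/* 100ms of CPU time */
		pthread_t t;

		pthread_create(&t, NULL, burn, NULL);

		/* Relative sleep: returns after the process consumed 100ms more CPU */
		printf("clock_nanosleep() = %d\n",
		       clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &ts, NULL));
		return 0;
	}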
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/all/20241105064213.238550394@linutronix.de
---
 include/linux/posix-timers.h   | 2 ++
 kernel/time/posix-cpu-timers.c | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index b1de21731a082f..bcd01208d79551 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -42,6 +42,7 @@ static inline int clockid_to_fd(const clockid_t clk)
  * @pid:	Pointer to target task PID
  * @elist:	List head for the expiry list
  * @firing:	Timer is currently firing
+ * @nanosleep:	Timer is used for nanosleep and is not a regular posix-timer
  * @handling:	Pointer to the task which handles expiry
  */
 struct cpu_timer {
@@ -50,6 +51,7 @@ struct cpu_timer {
 	struct pid		*pid;
 	struct list_head	elist;
 	bool			firing;
+	bool			nanosleep;
 	struct task_struct __rcu *handling;
 };

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a282a3c0060543..0c441d8c26047b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -596,7 +596,7 @@ static void cpu_timer_fire(struct k_itimer *timer)

 	timer->it_status = POSIX_TIMER_DISARMED;

-	if (unlikely(timer->sigq == NULL)) {
+	if (unlikely(ctmr->nanosleep)) {
 		/*
 		 * This a special case for clock_nanosleep,
 		 * not a normal timer from sys_timer_create.
@@ -1493,6 +1493,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	timer.it_overrun = -1;
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
+	timer.it.cpu.nanosleep = true;

 	if (!error) {
 		static struct itimerspec64 zero_it;

From 5d916a0988eed5217c103932ff4887c9ae83c89c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:36 +0100
Subject: [PATCH 096/140] posix-timers: Add a refcount to struct k_itimer

To cure the SIG_IGN handling for posix interval timers, the preallocated
sigqueue needs to be embedded into struct k_itimer to prevent life time
races of all sorts.

To make that work correctly it needs reference counting so that timer
deletion does not free the timer prematurely when there is a signal queued
or delivered concurrently.

Add an rcuref to the posix timer part.
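The intended usage pattern is roughly the following (a simplified sketch,
not the literal kernel code; queue_signal() stands in for the actual
signal queueing):

	/* Queueing side: hold a reference while the signal is in flight */
	if (rcuref_get(&tmr->rcuref))		/* can fail once the timer is dead */
		queue_signal(tmr);

	/* Drop side: free the timer only when the last reference is gone */
	if (rcuref_put(&tmr->rcuref))
		posixtimer_free_timer(tmr);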
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.304756440@linutronix.de --- include/linux/posix-timers.h | 14 ++++++++++++++ kernel/time/posix-timers.c | 7 ++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index bcd01208d79551..9740fd0c2933f8 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -6,11 +6,13 @@ #include #include #include +#include #include #include struct kernel_siginfo; struct task_struct; +struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) @@ -105,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_deliver_signal(struct kernel_siginfo *info); +void posixtimer_free_timer(struct k_itimer *timer); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -129,6 +132,7 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; } +static inline void posixtimer_free_timer(struct k_itimer *timer) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -156,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) + * @rcuref: Reference count for life time management * @sigq: Pointer to preallocated sigqueue * @it: Union representing the various posix timer type * internals. 
@@ -180,6 +185,7 @@ struct k_itimer { struct task_struct *it_process; }; struct sigqueue *sigq; + rcuref_t rcuref; union { struct { struct hrtimer timer; @@ -200,4 +206,12 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); +#ifdef CONFIG_POSIX_TIMERS +static inline void posixtimer_putref(struct k_itimer *tmr) +{ + if (rcuref_put(&tmr->rcuref)) + posixtimer_free_timer(tmr); +} +#endif /* !CONFIG_POSIX_TIMERS */ + #endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 66ed49efc02f44..53bd3c4de92c5a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -417,10 +417,11 @@ static struct k_itimer * alloc_posix_timer(void) return NULL; } clear_siginfo(&tmr->sigq->info); + rcuref_init(&tmr->rcuref, 1); return tmr; } -static void posix_timer_free(struct k_itimer *tmr) +void posixtimer_free_timer(struct k_itimer *tmr) { put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -432,7 +433,7 @@ static void posix_timer_unhash_and_free(struct k_itimer *tmr) spin_lock(&hash_lock); hlist_del_rcu(&tmr->t_hash); spin_unlock(&hash_lock); - posix_timer_free(tmr); + posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) @@ -467,7 +468,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { - posix_timer_free(new_timer); + posixtimer_free_timer(new_timer); return new_timer_id; } From 5cac427f7971b0619ebbfc131ef81fcf229c3c01 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:38 +0100 Subject: [PATCH 097/140] signal: Split up __sigqueue_alloc() To cure the SIG_IGN handling for posix interval timers, the preallocated sigqueue needs to be embedded into struct k_itimer to prevent life time races of all sorts. Reorganize __sigqueue_alloc() so the ucounts retrieval and the initialization can be used independently. No functional change. 
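As background (an illustrative userspace demo, not part of this patch;
error handling omitted): the quota which sig_get_ucounts() enforces is the
RLIMIT_SIGPENDING accounting, observable like this. The exact point of
failure depends on signals already pending for the user:

	#include <signal.h>
	#include <stdio.h>
	#include <sys/resource.h>
	#include <unistd.h>

	int main(void)
	{
		struct rlimit rl = { .rlim_cur = 4, .rlim_max = 4 };
		sigset_t set;

		setrlimit(RLIMIT_SIGPENDING, &rl);
		sigemptyset(&set);
		sigaddset(&set, SIGRTMIN);
		sigprocmask(SIG_BLOCK, &set, NULL);	/* let signals pile up */

		for (int i = 0; i < 8; i++) {
			union sigval v = { .sival_int = i };

			/* Fails with EAGAIN once the pending quota is exhausted */
			if (sigqueue(getpid(), SIGRTMIN, v))
				perror("sigqueue");
		}
		return 0;
	}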
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.371410037@linutronix.de --- kernel/signal.c | 52 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index ba7159b25d514f..dbd42471cf0306 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -396,16 +396,9 @@ void task_join_group_stop(struct task_struct *task) task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } -/* - * allocate a new signal queue record - * - this may be called without locks if and only if t == current, otherwise an - * appropriate lock must be held to stop the target task from exiting - */ -static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, - int override_rlimit, const unsigned int sigqueue_flags) +static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, + int override_rlimit) { - struct sigqueue *q = NULL; struct ucounts *ucounts; long sigpending; @@ -424,19 +417,44 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, if (!sigpending) return NULL; - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); - } else { + if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { + dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); + return NULL; } - if (unlikely(q == NULL)) { + return ucounts; +} + +static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, + const unsigned int sigqueue_flags) +{ + INIT_LIST_HEAD(&q->list); + q->flags = sigqueue_flags; + q->ucounts = ucounts; +} + +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appropriate lock must be held to stop the target task from exiting + */ +static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, + int override_rlimit, const unsigned int sigqueue_flags) +{ + struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); + struct sigqueue *q; + + if (!ucounts) + return NULL; + + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); + if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); - } else { - INIT_LIST_HEAD(&q->list); - q->flags = sigqueue_flags; - q->ucounts = ucounts; + return NULL; } + + __sigqueue_init(q, ucounts, sigqueue_flags); return q; } From 54f1dd642fd088ba969206f09e7afffad7d9db2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:39 +0100 Subject: [PATCH 098/140] signal: Provide posixtimer_sigqueue_init() To cure the SIG_IGN handling for posix interval timers, the preallocated sigqueue needs to be embedded into struct k_itimer to prevent life time races of all sorts. Provide a new function to initialize the embedded sigqueue to prepare for that. 
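For context, the consumer of this helper arrives later in this series when
the sigqueue gets embedded; the allocation path then looks roughly like
this (sketch of that later change, shown here only to motivate the
helper):

	static struct k_itimer *alloc_posix_timer(void)
	{
		struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);

		if (!tmr)
			return NULL;

		/* Initialize the embedded sigqueue including rlimit accounting */
		if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) {
			kmem_cache_free(posix_timers_cache, tmr);
			return NULL;
		}
		rcuref_init(&tmr->rcuref, 1);
		return tmr;
	}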
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.450427515@linutronix.de --- include/linux/posix-timers.h | 2 ++ kernel/signal.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 9740fd0c2933f8..200098d27cc040 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -12,6 +12,7 @@ struct kernel_siginfo; struct task_struct; +struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, @@ -106,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, } void posixtimer_rearm_itimer(struct task_struct *p); +bool posixtimer_init_sigqueue(struct sigqueue *q); bool posixtimer_deliver_signal(struct kernel_siginfo *info); void posixtimer_free_timer(struct k_itimer *timer); diff --git a/kernel/signal.c b/kernel/signal.c index dbd42471cf0306..911ed3ab479e75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1905,6 +1905,17 @@ void flush_itimer_signals(void) __flush_itimer_signals(&tsk->signal->shared_pending); } +bool posixtimer_init_sigqueue(struct sigqueue *q) +{ + struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); + + if (!ucounts) + return false; + clear_siginfo(&q->info); + __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); + return true; +} + struct sigqueue *sigqueue_alloc(void) { return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC); From ef1c5bcd6daa674392bdf89b8ae889aafd73f956 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:41 +0100 Subject: [PATCH 099/140] posix-timers: Store PID type in the timer instead of re-evaluating the signal delivery mode everywhere. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.519086500@linutronix.de --- include/linux/posix-timers.h | 2 ++ kernel/time/posix-timers.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 200098d27cc040..947176582de9e5 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -180,6 +181,7 @@ struct k_itimer { s64 it_overrun_last; unsigned int it_signal_seq; int it_sigev_notify; + enum pid_type it_pid_type; ktime_t it_interval; struct signal_struct *it_signal; union { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 53bd3c4de92c5a..f18d64c7cd3bdc 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -298,7 +298,6 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info) int posix_timer_queue_signal(struct k_itimer *timr) { enum posix_timer_state state = POSIX_TIMER_DISARMED; - enum pid_type type; int ret; lockdep_assert_held(&timr->it_lock); @@ -308,8 +307,7 @@ int posix_timer_queue_signal(struct k_itimer *timr) timr->it_status = state; - type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type, timr->it_signal_seq); + ret = send_sigqueue(timr->sigq, timr->it_pid, timr->it_pid_type, timr->it_signal_seq); /* If we failed to send the signal the timer stops. 
 	 */
 	return ret > 0;
 }
@@ -496,6 +494,11 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 		new_timer->it_pid = get_pid(task_tgid(current));
 	}

+	if (new_timer->it_sigev_notify & SIGEV_THREAD_ID)
+		new_timer->it_pid_type = PIDTYPE_PID;
+	else
+		new_timer->it_pid_type = PIDTYPE_TGID;
+
 	new_timer->sigq->info.si_tid = new_timer->it_id;
 	new_timer->sigq->info.si_code = SI_TIMER;

From 0360ed14d9826678a50fa2b873e522a24cd3c018 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:42 +0100
Subject: [PATCH 100/140] signal: Refactor send_sigqueue()

To handle posix timers which have their signal ignored via SIG_IGN
properly, it is required to requeue an ignored signal for delivery when
SIG_IGN is lifted so the timer gets rearmed.

Split the required code out of send_sigqueue() so it can be reused in the
context of sigaction().

While at it, rename send_sigqueue() to posixtimer_send_sigqueue() so it's
clear what this is about.

Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/all/20241105064213.586453412@linutronix.de
---
 include/linux/posix-timers.h |  1 +
 include/linux/sched/signal.h |  1 -
 kernel/signal.c              | 82 ++++++++++++++++++++----------
 kernel/time/posix-timers.c   |  2 +-
 4 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 947176582de9e5..52611ea923b2c8 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -109,6 +109,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_init_sigqueue(struct sigqueue *q);
+int posixtimer_send_sigqueue(struct k_itimer *tmr);
 bool posixtimer_deliver_signal(struct kernel_siginfo *info);
 void posixtimer_free_timer(struct k_itimer *timer);

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index bd9f569231d98a..36283c1c55e9b3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -340,7 +340,6 @@ extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

 static inline void clear_notify_signal(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 911ed3ab479e75..5b71e26abb0eb7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1947,40 +1947,54 @@ void sigqueue_free(struct sigqueue *q)
 		__sigqueue_free(q);
 }

-int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int si_private)
+static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type)
 {
-	int sig = q->info.si_signo;
 	struct sigpending *pending;
+	int sig = q->info.si_signo;
+
+	signalfd_notify(t, sig);
+	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
+	list_add_tail(&q->list, &pending->list);
+	sigaddset(&pending->signal, sig);
+	complete_signal(sig, t, type);
+}
+
+/*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ * + * Where type is not PIDTYPE_PID, signals must be delivered to the + * process. In this case, prefer to deliver to current if it is in + * the same thread group as the target process, which avoids + * unnecessarily waking up a potentially idle task. + */ +static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) +{ + struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); + + if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current)) + t = current; + return t; +} + +int posixtimer_send_sigqueue(struct k_itimer *tmr) +{ + struct sigqueue *q = tmr->sigq; + int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; int ret, result; - if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) - return 0; - if (WARN_ON_ONCE(q->info.si_code != SI_TIMER)) - return 0; - - ret = -1; - rcu_read_lock(); + guard(rcu)(); - /* - * This function is used by POSIX timers to deliver a timer signal. - * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID - * set), the signal must be delivered to the specific thread (queues - * into t->pending). - * - * Where type is not PIDTYPE_PID, signals must be delivered to the - * process. In this case, prefer to deliver to current if it is in - * the same thread group as the target process, which avoids - * unnecessarily waking up a potentially idle task. - */ - t = pid_task(pid, type); + t = posixtimer_get_target(tmr); if (!t) - goto ret; - if (type != PIDTYPE_PID && same_thread_group(t, current)) - t = current; + return -1; + if (!likely(lock_task_sighand(t, &flags))) - goto ret; + return -1; /* * Update @q::info::si_sys_private for posix timer signals with @@ -1988,30 +2002,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type, int s * decides based on si_sys_private whether to invoke * posixtimer_rearm() or not. */ - q->info.si_sys_private = si_private; + q->info.si_sys_private = tmr->it_signal_seq; ret = 1; /* the signal is ignored */ - result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, false)) + if (!prepare_signal(sig, t, false)) { + result = TRACE_SIGNAL_IGNORED; goto out; + } ret = 0; if (unlikely(!list_empty(&q->list))) { result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } - - signalfd_notify(t, sig); - pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; - list_add_tail(&q->list, &pending->list); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, type); + posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); + trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); -ret: - rcu_read_unlock(); return ret; } #endif /* CONFIG_POSIX_TIMERS */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f18d64c7cd3bdc..0901ed9ca18309 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -307,7 +307,7 @@ int posix_timer_queue_signal(struct k_itimer *timr) timr->it_status = state; - ret = send_sigqueue(timr->sigq, timr->it_pid, timr->it_pid_type, timr->it_signal_seq); + ret = posixtimer_send_sigqueue(timr); /* If we failed to send the signal the timer stops. 
 	 */
 	return ret > 0;
 }

From 11629b9808e5900d675fd469d19932ea48060de3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:43 +0100
Subject: [PATCH 101/140] signal: Replace resched_timer logic

In preparation for handling ignored posix timer signals correctly and
embedding the sigqueue struct into struct k_itimer, hand down a pointer to
the sigqueue struct into posixtimer_deliver_signal() instead of just
having a boolean flag.

No functional change.

Suggested-by: Eric W. Biederman
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Acked-by: "Eric W. Biederman"
Link: https://lore.kernel.org/all/20241105064213.652658158@linutronix.de
---
 include/linux/posix-timers.h |  5 +++--
 kernel/signal.c              | 32 ++++++++++++++++++------------
 kernel/time/posix-timers.c   |  2 +-
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 52611ea923b2c8..39f1db76833ad9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -110,7 +110,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 void posixtimer_rearm_itimer(struct task_struct *p);
 bool posixtimer_init_sigqueue(struct sigqueue *q);
 int posixtimer_send_sigqueue(struct k_itimer *tmr);
-bool posixtimer_deliver_signal(struct kernel_siginfo *info);
+bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
 void posixtimer_free_timer(struct k_itimer *timer);

@@ -135,7 +135,8 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
 static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
-static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; }
+static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
+					     struct sigqueue *timer_sigq) { return false; }
 static inline void posixtimer_free_timer(struct k_itimer *timer) { }
 #endif

diff --git a/kernel/signal.c b/kernel/signal.c
index 5b71e26abb0eb7..0ddb5dd284aa01 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -545,7 +545,7 @@ bool unhandled_signal(struct task_struct *tsk, int sig)
 }

 static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
-			   bool *resched_timer)
+			   struct sigqueue **timer_sigq)
 {
 	struct sigqueue *q, *first = NULL;

@@ -568,10 +568,17 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
 		list_del_init(&first->list);
 		copy_siginfo(info, &first->info);

-		*resched_timer = (first->flags & SIGQUEUE_PREALLOC) &&
-				 (info->si_code == SI_TIMER);
-
-		__sigqueue_free(first);
+		/*
+		 * posix-timer signals are preallocated and freed when the
+		 * timer goes away. Either directly or by clearing
+		 * SIGQUEUE_PREALLOC so that the next delivery will free
+		 * them. Spare the extra round through __sigqueue_free()
+		 * which is ignoring preallocated signals.
+		 */
+		if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER)))
+			*timer_sigq = first;
+		else
+			__sigqueue_free(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.
This must be
@@ -588,12 +595,12 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
-			    kernel_siginfo_t *info, bool *resched_timer)
+			    kernel_siginfo_t *info, struct sigqueue **timer_sigq)
 {
 	int sig = next_signal(pending, mask);

 	if (sig)
-		collect_signal(sig, pending, info, resched_timer);
+		collect_signal(sig, pending, info, timer_sigq);

 	return sig;
 }

@@ -605,18 +612,19 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 {
 	struct task_struct *tsk = current;
-	bool resched_timer = false;
+	struct sigqueue *timer_sigq;
 	int signr;

 	lockdep_assert_held(&tsk->sighand->siglock);

again:
 	*type = PIDTYPE_PID;
-	signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
+	timer_sigq = NULL;
+	signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq);
 	if (!signr) {
 		*type = PIDTYPE_TGID;
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
-					 mask, info, &resched_timer);
+					 mask, info, &timer_sigq);

 		if (unlikely(signr == SIGALRM))
 			posixtimer_rearm_itimer(tsk);

@@ -642,8 +650,8 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
 		current->jobctl |= JOBCTL_STOP_DEQUEUED;
 	}

-	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(resched_timer)) {
-		if (!posixtimer_deliver_signal(info))
+	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) {
+		if (!posixtimer_deliver_signal(info, timer_sigq))
 			goto again;
 	}

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0901ed9ca18309..d6fef064b35799 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -254,7 +254,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
  * This function is called from the signal delivery code. It decides
  * whether the signal should be dropped and rearms interval timers.
  */
-bool posixtimer_deliver_signal(struct kernel_siginfo *info)
+bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq)
 {
 	struct k_itimer *timr;
 	unsigned long flags;

From 6017a158beb13b412e55a451379798aae5876514 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:45 +0100
Subject: [PATCH 102/140] posix-timers: Embed sigqueue in struct k_itimer

To cure the SIG_IGN handling for posix interval timers, the preallocated
sigqueue needs to be embedded into struct k_itimer to prevent life time
races of all sorts.

Now that the prerequisites are in place, embed the sigqueue into struct
k_itimer and fix up the relevant usage sites.

Aside from preparing for proper SIG_IGN handling, this spares an extra
allocation.
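The key structural consequence (a sketch; timer_from_sigq() is an
illustrative name, the actual change open codes the container_of()): a
queued signal now maps back to its timer without any hash lookup:

	static struct k_itimer *timer_from_sigq(struct sigqueue *q)
	{
		/* Valid because sigq is embedded in struct k_itimer */
		return container_of(q, struct k_itimer, sigq);
	}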
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.719695194@linutronix.de --- fs/proc/base.c | 4 +- include/linux/posix-timers.h | 23 +++++++++- kernel/signal.c | 19 +++++--- kernel/time/posix-timers.c | 88 +++++++++++++++++++++--------------- 4 files changed, 87 insertions(+), 47 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52ea..6a37a43241e4a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2553,8 +2553,8 @@ static int show_timer(struct seq_file *m, void *v) seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", - timer->sigq->info.si_signo, - timer->sigq->info.si_value.sival_ptr); + timer->sigq.info.si_signo, + timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 39f1db76833ad9..28c0a30e08532c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -39,6 +39,8 @@ static inline int clockid_to_fd(const clockid_t clk) #ifdef CONFIG_POSIX_TIMERS +#include + /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig @@ -166,7 +168,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management - * @sigq: Pointer to preallocated sigqueue + * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. @@ -190,7 +192,7 @@ struct k_itimer { struct pid *it_pid; struct task_struct *it_process; }; - struct sigqueue *sigq; + struct sigqueue sigq; rcuref_t rcuref; union { struct { @@ -218,6 +220,23 @@ static inline void posixtimer_putref(struct k_itimer *tmr) if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } + +static inline void posixtimer_sigqueue_getref(struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); +} + +static inline void posixtimer_sigqueue_putref(struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + posixtimer_putref(tmr); +} +#else /* CONFIG_POSIX_TIMERS */ +static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } +static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif diff --git a/kernel/signal.c b/kernel/signal.c index 0ddb5dd284aa01..2d74cd5841ae7f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -460,8 +460,10 @@ static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t g static void __sigqueue_free(struct sigqueue *q) { - if (q->flags & SIGQUEUE_PREALLOC) + if (q->flags & SIGQUEUE_PREALLOC) { + posixtimer_sigqueue_putref(q); return; + } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; @@ -569,11 +571,11 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i copy_siginfo(info, &first->info); /* - * posix-timer signals are preallocated and freed when the - * timer goes away. Either directly or by clearing - * SIGQUEUE_PREALLOC so that the next delivery will free - * them. 
Spare the extra round through __sigqueue_free() - * which is ignoring preallocated signals. + * posix-timer signals are preallocated and freed when the last + * reference count is dropped in posixtimer_deliver_signal() or + * immediately on timer deletion when the signal is not pending. + * Spare the extra round through __sigqueue_free() which is + * ignoring preallocated signals. */ if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) *timer_sigq = first; @@ -1989,7 +1991,7 @@ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) int posixtimer_send_sigqueue(struct k_itimer *tmr) { - struct sigqueue *q = tmr->sigq; + struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; @@ -2020,9 +2022,12 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) ret = 0; if (unlikely(!list_empty(&q->list))) { + /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } + + posixtimer_sigqueue_getref(q); posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d6fef064b35799..2e2c0edcfa97d5 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -250,15 +250,40 @@ static void common_hrtimer_rearm(struct k_itimer *timr) hrtimer_restart(timer); } +static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) +{ + guard(spinlock)(&timr->it_lock); + + /* + * Check if the timer is still alive or whether it got modified + * since the signal was queued. In either case, don't rearm and + * drop the signal. + */ + if (timr->it_signal_seq != info->si_sys_private || WARN_ON_ONCE(!timr->it_signal)) + return false; + + if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) + return true; + + timr->kclock->timer_rearm(timr); + timr->it_status = POSIX_TIMER_ARMED; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_signal_seq; + info->si_overrun = timer_overrun_to_int(timr); + return true; +} + /* * This function is called from the signal delivery code. It decides - * whether the signal should be dropped and rearms interval timers. + * whether the signal should be dropped and rearms interval timers. The + * timer can be unconditionally accessed as there is a reference held on + * it. 
*/ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { - struct k_itimer *timr; - unsigned long flags; - bool ret = false; + struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq); + bool ret; /* * Release siglock to ensure proper locking order versus @@ -266,28 +291,11 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim */ spin_unlock(¤t->sighand->siglock); - timr = lock_timer(info->si_tid, &flags); - if (!timr) - goto out; - - if (timr->it_signal_seq != info->si_sys_private) - goto out_unlock; - - if (timr->it_interval && !WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) { - timr->kclock->timer_rearm(timr); + ret = __posixtimer_deliver_signal(info, timr); - timr->it_status = POSIX_TIMER_ARMED; - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1LL; - ++timr->it_signal_seq; - - info->si_overrun = timer_overrun_to_int(timr); - } - ret = true; + /* Drop the reference which was acquired when the signal was queued */ + posixtimer_putref(timr); -out_unlock: - unlock_timer(timr, flags); -out: spin_lock(¤t->sighand->siglock); /* Don't expose the si_sys_private value to userspace */ @@ -404,17 +412,17 @@ static struct pid *good_sigevent(sigevent_t * event) } } -static struct k_itimer * alloc_posix_timer(void) +static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; - if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + + if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } - clear_siginfo(&tmr->sigq->info); rcuref_init(&tmr->rcuref, 1); return tmr; } @@ -422,7 +430,8 @@ static struct k_itimer * alloc_posix_timer(void) void posixtimer_free_timer(struct k_itimer *tmr) { put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); + if (tmr->sigq.ucounts) + dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING); kfree_rcu(tmr, rcu); } @@ -484,13 +493,13 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } new_timer->it_sigev_notify = event->sigev_notify; - new_timer->sigq->info.si_signo = event->sigev_signo; - new_timer->sigq->info.si_value = event->sigev_value; + new_timer->sigq.info.si_signo = event->sigev_signo; + new_timer->sigq.info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->sigq->info.si_signo = SIGALRM; - memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); - new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->sigq.info.si_signo = SIGALRM; + memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -499,8 +508,8 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, else new_timer->it_pid_type = PIDTYPE_TGID; - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; + new_timer->sigq.info.si_tid = new_timer->it_id; + new_timer->sigq.info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; @@ -584,7 +593,14 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * 1) Set timr::it_signal to NULL with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock - * 4) Call RCU for removal after the grace period + * 4) Put the 
reference count.
+	 *
+	 * The reference count might not drop to zero if timr::sigq is
+	 * queued. In that case the signal delivery or flush will put the
+	 * last reference count.
+	 *
+	 * When the reference count reaches zero, the timer is scheduled
+	 * for RCU removal after the grace period.
 	 *
 	 * Holding rcu_read_lock() across the lookup ensures that
 	 * the timer cannot be freed.

From c2a4796a154bb952be1106911841aab2c8c17c4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 Nov 2024 09:14:46 +0100
Subject: [PATCH 103/140] signal: Cleanup unused posix-timer leftovers

Remove the leftovers of sigqueue preallocation as it's no longer used.

Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/all/20241105064213.786506636@linutronix.de
---
 include/linux/sched/signal.h |  2 --
 kernel/signal.c              | 39 ++++--------------------------------
 2 files changed, 4 insertions(+), 37 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 36283c1c55e9b3..02972fd41931e5 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -338,8 +338,6 @@ extern void force_fatal_sig(int);
 extern void force_exit_sig(int);
 extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
-extern struct sigqueue *sigqueue_alloc(void);
-extern void sigqueue_free(struct sigqueue *);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

 static inline void clear_notify_signal(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 2d74cd5841ae7f..d267a2c5e97785 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -439,8 +439,8 @@ static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts,
 * - this may be called without locks if and only if t == current, otherwise an
 *   appropriate lock must be held to stop the target task from exiting
 */
-static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
-					 int override_rlimit, const unsigned int sigqueue_flags)
+static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
+				       int override_rlimit)
 {
 	struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit);
 	struct sigqueue *q;
@@ -454,7 +454,7 @@ static struct sigqueue *__sigqueue_alloc(int sig, struct task_struct *t, gfp_t g
 		return NULL;
 	}

-	__sigqueue_init(q, ucounts, sigqueue_flags);
+	__sigqueue_init(q, ucounts, 0);
 	return q;
 }

@@ -1070,7 +1070,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info,
 		else
 			override_rlimit = 0;

-		q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
+		q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);

 		if (q) {
 			list_add_tail(&q->list, &pending->list);
@@ -1926,37 +1926,6 @@ bool posixtimer_init_sigqueue(struct sigqueue *q)
 	return true;
 }

-struct sigqueue *sigqueue_alloc(void)
-{
-	return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
-}
-
-void sigqueue_free(struct sigqueue *q)
-{
-	spinlock_t *lock = &current->sighand->siglock;
-	unsigned long flags;
-
-	if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
-		return;
-	/*
-	 * We must hold ->siglock while testing q->list
-	 * to serialize with collect_signal() or with
-	 * __exit_signal()->flush_sigqueue().
-	 */
-	spin_lock_irqsave(lock, flags);
-	q->flags &= ~SIGQUEUE_PREALLOC;
-	/*
-	 * If it is queued it will be freed when dequeued,
-	 * like the "regular" sigqueue.
- */ - if (!list_empty(&q->list)) - q = NULL; - spin_unlock_irqrestore(lock, flags); - - if (q) - __sigqueue_free(q); -} - static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { struct sigpending *pending; From 647da5f709f112319c0d51e06f330d8afecb1940 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:48 +0100 Subject: [PATCH 104/140] posix-timers: Move sequence logic into struct k_itimer The posix timer signal handling uses siginfo::si_sys_private for handling the sequence counter check. That indirection is no longer required and the sequence count value at signal queueing time can be stored in struct k_itimer itself. This removes the requirement of treating siginfo::si_sys_private as special as it's now always zero because the kernel does not touch it anymore. Suggested-by: Eric W. Biederman Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/all/20241105064213.852619866@linutronix.de --- include/linux/posix-timers.h | 2 ++ include/uapi/asm-generic/siginfo.h | 2 +- kernel/signal.c | 8 +++----- kernel/time/posix-timers.c | 5 +---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 28c0a30e08532c..49a89614d90016 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -162,6 +162,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery + * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct @@ -184,6 +185,7 @@ struct k_itimer { s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; + unsigned int it_sigqueue_seq; int it_sigev_notify; enum pid_type it_pid_type; ktime_t it_interval; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index b7bc545ec3b2ce..5a1ca43b5fc606 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -46,7 +46,7 @@ union __sifields { __kernel_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ sigval_t _sigval; /* same as below */ - int _sys_private; /* not to be passed to user */ + int _sys_private; /* Not used by the kernel. Historic leftover. Always 0. */ } _timer; /* POSIX.1b signals */ diff --git a/kernel/signal.c b/kernel/signal.c index d267a2c5e97785..d2734dc4d74ffd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1976,12 +1976,10 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) return -1; /* - * Update @q::info::si_sys_private for posix timer signals with - * sighand locked to prevent a race against dequeue_signal() which - * decides based on si_sys_private whether to invoke - * posixtimer_rearm() or not. + * Update @tmr::sigqueue_seq for posix timer signals with sighand + * locked to prevent a race against dequeue_signal().
*/ - q->info.si_sys_private = tmr->it_signal_seq; + tmr->it_sigqueue_seq = tmr->it_signal_seq; ret = 1; /* the signal is ignored */ if (!prepare_signal(sig, t, false)) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2e2c0edcfa97d5..f20c06d0cf09d0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -259,7 +259,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != info->si_sys_private || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -297,9 +297,6 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim posixtimer_putref(timr); spin_lock(&current->sighand->siglock); - - /* Don't expose the si_sys_private value to userspace */ - info->si_sys_private = 0; return ret; } From 69f032c92cf883ea74a4b69ba3d91317aa6f174e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:49 +0100 Subject: [PATCH 105/140] signal: Provide ignored_posix_timers list To prepare for handling posix timer signals on sigaction(SIG_IGN) properly, add a list to task::signal. This list will be used to queue posix timers so their signal can be requeued when SIG_IGN is lifted later. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.920101900@linutronix.de --- include/linux/sched/signal.h | 1 + init/init_task.c | 5 +++-- kernel/fork.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 02972fd41931e5..d5d03d919df811 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -138,6 +138,7 @@ struct signal_struct { /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct hlist_head posix_timers; + struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; diff --git a/init/init_task.c b/init/init_task.c index 136a8231355ab7..e557f622bd9061 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -30,8 +30,9 @@ static struct signal_struct init_signals = { .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS - .posix_timers = HLIST_HEAD_INIT, - .cputimer = { + .posix_timers = HLIST_HEAD_INIT, + .ignored_posix_timers = HLIST_HEAD_INIT, + .cputimer = { .cputime_atomic = INIT_CPUTIME_ATOMIC, }, #endif diff --git a/kernel/fork.c b/kernel/fork.c index 60c0b4868fd499..c2bd8367a850ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1864,6 +1864,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->ignored_posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif From 0e20cd33acc7a173b23900550331ee82a23e9f00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:51 +0100 Subject: [PATCH 106/140] posix-timers: Handle ignored list on delete and exit To handle posix timer signals on sigaction(SIG_IGN) properly, the timers will be queued on a separate ignored
list. Add the necessary cleanup code for timer_delete() and exit_itimers(). Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.987530588@linutronix.de --- include/linux/posix-timers.h | 4 +++- kernel/time/posix-timers.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 49a89614d90016..1608b52a44d5c8 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -152,7 +152,8 @@ static inline void posix_cputimers_init_work(void) { } /** * struct k_itimer - POSIX.1b interval timer structure. - * @list: List head for binding the timer to signals->posix_timers + * @list: List node for binding the timer to tsk::signal::posix_timers + * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer @@ -176,6 +177,7 @@ static inline void posix_cputimers_init_work(void) { } */ struct k_itimer { struct hlist_node list; + struct hlist_node ignored_list; struct hlist_node t_hash; spinlock_t it_lock; const struct k_clock *kclock; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f20c06d0cf09d0..2b88fb4e937eeb 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1027,6 +1027,18 @@ int common_timer_del(struct k_itimer *timer) return 0; } +/* + * If the deleted timer is on the ignored list, remove it and + * drop the associated reference. + */ +static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) +{ + if (!hlist_unhashed(&tmr->ignored_list)) { + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } +} + static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; @@ -1059,6 +1071,7 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) spin_lock(&current->sighand->siglock); hlist_del(&timer->list); + posix_timer_cleanup_ignored(timer); spin_unlock(&current->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It @@ -1110,6 +1123,8 @@ static void itimer_delete(struct k_itimer *timer) } hlist_del(&timer->list); + posix_timer_cleanup_ignored(timer); + /* * Setting timer::it_signal to NULL is technically not required * here as nothing can access the timer anymore legitimately via @@ -1142,6 +1157,19 @@ void exit_itimers(struct task_struct *tsk) /* The timers are not longer accessible via tsk::signal */ while (!hlist_empty(&timers)) itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + + /* + * There should be no timers on the ignored list. itimer_delete() has + * mopped them up.
+ */ + if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) + return; + + hlist_move_list(&tsk->signal->ignored_posix_timers, &timers); + while (!hlist_empty(&timers)) { + posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer, + ignored_list)); + } } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, From caf77435dd8a52cb39c602bdf67d35d6f782f553 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:52 +0100 Subject: [PATCH 107/140] signal: Handle ignored signals in do_sigaction(action != SIG_IGN) When a real handler (including SIG_DFL) is installed for a signal which previously had SIG_IGN set, then the list of ignored posix timers has to be checked for timers which are affected by this change. Add a list walk function which checks for the matching signal number and if found requeues the timer's signal, so the timer is rearmed on signal delivery. Rearming the timer right away is not possible because that requires dropping the sighand lock. No functional change as the counterpart which queues the timers on the ignored list is still missing. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.054091076@linutronix.de --- kernel/signal.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index d2734dc4d74ffd..908b49c594e457 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2002,7 +2002,54 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) unlock_task_sighand(t, &flags); return ret; } -#endif /* CONFIG_POSIX_TIMERS */ + +static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) +{ + struct hlist_head *head = &tsk->signal->ignored_posix_timers; + struct hlist_node *tmp; + struct k_itimer *tmr; + + if (likely(hlist_empty(head))) + return; + + /* + * Rearming a timer with sighand lock held is not possible due to + * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and + * let the signal delivery path deal with it whether it needs to be + * rearmed or not. This cannot be decided here w/o dropping sighand + * lock and creating a loop retry horror show. + */ + hlist_for_each_entry_safe(tmr, tmp, head, ignored_list) { + struct task_struct *target; + + /* + * tmr::sigq.info.si_signo is immutable, so accessing it + * without holding tmr::it_lock is safe. + */ + if (tmr->sigq.info.si_signo != sig) + continue; + + hlist_del_init(&tmr->ignored_list); + + /* This should never happen and leaks a reference count */ + if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) + continue; + + /* + * Get the target for the signal. If target is a thread and + * has exited by now, drop the reference count.
+ */ + guard(rcu)(); + target = posixtimer_get_target(tmr); + if (target) + posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); + else + posixtimer_putref(tmr); + } +} +#else /* CONFIG_POSIX_TIMERS */ +static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } +#endif /* !CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { @@ -4180,6 +4227,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) sigaction_compat_abi(act, oact); if (act) { + bool was_ignored = k->sa.sa_handler == SIG_IGN; + sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; @@ -4200,6 +4249,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &mask, &t->pending); + } else if (was_ignored) { + posixtimer_sig_unignore(p, sig); } } From df7a996b4dab03c889fa86d849447b716f07b069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:54 +0100 Subject: [PATCH 108/140] signal: Queue ignored posixtimers on ignore list Queue posixtimers which have their signal ignored on the ignored list: 1) When the timer fires and the signal has SIG_IGN set 2) When SIG_IGN is installed via sigaction() and a timer signal is already queued This only happens when the signal is for a valid timer, which delivered the signal in periodic mode. One-shot timer signals are correctly dropped. Due to the lock order constraints (sighand::siglock nests inside timer::lock) the signal code cannot access any of the timer fields which are relevant to make this decision, e.g. timer::it_status. This is addressed by establishing a protection scheme which requires locking both locks on the timer side for modifying decision fields in the timer struct and therefore makes it possible for the signal delivery to evaluate with only sighand::siglock being held: 1) Move the NULLification of timer->it_signal into the sighand::siglock protected section of timer_delete() and check timer::it_signal in the code path which determines whether the signal is dropped or queued on the ignore list. This ensures that a deleted timer cannot be moved onto the ignore list, which would prevent it from being freed on exit() as it is no longer in the process' posix timer list. If the timer got moved to the ignored list before deletion then it is removed from the ignored list under sighand lock in timer_delete(). 2) Provide a new timer::it_sig_periodic flag, which gets set in the signal queue path with both timer and sighand locks held if the timer is actually in periodic mode at expiry time. The ignore list code checks this flag under sighand::siglock and drops the signal when it is not set. If it is set, then the signal is moved to the ignored list independent of the actual state of the timer. When the signal is un-ignored later then the signal is moved back to the signal queue. On signal delivery the posix timer side decides about dropping the signal if the timer was re-armed, dis-armed or deleted based on the signal sequence counter check. If the thread/process exits then not yet delivered signals are discarded which means the reference of the timer containing the sigqueue is dropped and frees the timer.
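A rough user-space analogue of this protection scheme (a minimal sketch with hypothetical names, not kernel code) makes the invariant visible: the decision field is only written with both locks held, so a reader holding either lock alone sees a consistent value:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t timer_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for timer::it_lock */
static pthread_mutex_t sig_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for sighand::siglock */
static bool sig_periodic;					/* stands in for timer::it_sig_periodic */

/* Timer side: modify the decision field with both locks held */
static void timer_set_periodic(bool periodic)
{
	pthread_mutex_lock(&timer_lock);
	pthread_mutex_lock(&sig_lock);		/* sig_lock nests inside timer_lock */
	sig_periodic = periodic;
	pthread_mutex_unlock(&sig_lock);
	pthread_mutex_unlock(&timer_lock);
}

/* Signal side: holding only sig_lock is sufficient to evaluate the field */
static bool signal_side_sees_periodic(void)
{
	bool ret;

	pthread_mutex_lock(&sig_lock);
	ret = sig_periodic;
	pthread_mutex_unlock(&sig_lock);
	return ret;
}

int main(void)
{
	timer_set_periodic(true);
	printf("periodic: %d\n", signal_side_sees_periodic());
	return 0;
}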
This is way cheaper than requiring all code paths to lock sighand::siglock of the target thread/process on any modification of timer::it_status or going all the way and removing pending signals from the signal queues on every rearm, disarm or delete operation. So the protection scheme here is that on the timer side both timer::lock and sighand::siglock have to be held for modifying timer::it_signal and timer::it_sig_periodic, which means that on the signal side holding sighand::siglock is enough to evaluate these fields. In posixtimer_deliver_signal() holding timer::lock is sufficient to do the sequence validation against timer::it_signal_seq because a concurrent expiry is waiting on timer::lock to be released. This completes the SIG_IGN handling and such timers are no longer self-rearmed, which avoids pointless wakeups. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.120756416@linutronix.de --- include/linux/posix-timers.h | 2 + kernel/signal.c | 80 +++++++++++++++++++++++++++++++++--- kernel/time/posix-timers.c | 7 +++- 3 files changed, 83 insertions(+), 6 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 1608b52a44d5c8..43ea6e784a257a 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -160,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer + * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery @@ -184,6 +185,7 @@ struct k_itimer { clockid_t it_clock; timer_t it_id; int it_status; + bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; diff --git a/kernel/signal.c b/kernel/signal.c index 908b49c594e457..9b098a7a206f39 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -59,6 +59,8 @@ #include #include /* for syscall_get_* */ +#include "time/posix-timers.h" /* * SLAB caches for signal bits. */ @@ -731,6 +733,16 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) kick_process(t); } +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); + +static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) +{ + if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) + __sigqueue_free(q); + else + posixtimer_sig_ignore(tsk, q); +} + /* Remove signals in mask from the pending set and queue.
*/ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { @@ -747,7 +759,7 @@ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct si list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); - __sigqueue_free(q); + sigqueue_free_ignored(p, q); } } } @@ -1964,7 +1976,7 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; - int ret, result; + int result; guard(rcu)(); @@ -1981,13 +1993,55 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) */ tmr->it_sigqueue_seq = tmr->it_signal_seq; - ret = 1; /* the signal is ignored */ + /* + * Set the signal delivery status under sighand lock, so that the + * ignored signal handling can distinguish between a periodic and a + * non-periodic timer. + */ + tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; + if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; + + /* Paranoia check. Try to survive. */ + if (WARN_ON_ONCE(!list_empty(&q->list))) + goto out; + + /* Periodic timers with SIG_IGN are queued on the ignored list */ + if (tmr->it_sig_periodic) { + /* + * Already queued means the timer was rearmed after + * the previous expiry got it on the ignore list. + * Nothing to do for that case. + */ + if (hlist_unhashed(&tmr->ignored_list)) { + /* + * Take a signal reference and queue it on + * the ignored list. + */ + posixtimer_sigqueue_getref(q); + posixtimer_sig_ignore(t, q); + } + } else if (!hlist_unhashed(&tmr->ignored_list)) { + /* + * Covers the case where a timer was periodic and + * then the signal was ignored. Later it was rearmed + * as oneshot timer. The previous signal is invalid + * now, and this oneshot signal has to be dropped. + * Remove it from the ignored list and drop the + * reference count as the signal is not longer + * queued. + */ + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } goto out; } - ret = 0; + /* This should never happen and leaks a reference count */ + if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list))) + hlist_del_init(&tmr->ignored_list); + if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; @@ -2000,7 +2054,22 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); - return ret; + return 0; +} + +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + /* + * If the timer is marked deleted already or the signal originates + * from a non-periodic timer, then just drop the reference + * count. Otherwise queue it on the ignored list. 
+ */ + if (tmr->it_signal && tmr->it_sig_periodic) + hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); + else + posixtimer_putref(tmr); } static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) @@ -2048,6 +2117,7 @@ static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) } } #else /* CONFIG_POSIX_TIMERS */ +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } #endif /* !CONFIG_POSIX_TIMERS */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2b88fb4e937eeb..ea72db3c93650d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1072,12 +1072,17 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) spin_lock(&current->sighand->siglock); hlist_del(&timer->list); posix_timer_cleanup_ignored(timer); - spin_unlock(&current->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It * will reevaluate with timer::it_lock held and observe the NULL. + * + * It must be written with siglock held so that the signal code + * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), + * which prevents it from moving a pending signal of a deleted + * timer to the ignore list. */ WRITE_ONCE(timer->it_signal, NULL); + spin_unlock(&current->sighand->siglock); unlock_timer(timer, flags); posix_timer_unhash_and_free(timer); From 7a66f72b09bb0762360274b1fb677b3433dbaa06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:55 +0100 Subject: [PATCH 109/140] posix-timers: Cleanup SIG_IGN workaround leftovers Now that ignored posix timer signals are requeued and the timers are rearmed on signal delivery the workaround to keep such timers alive and self-rearm them is no longer required. Remove the relevant hacks and the no longer required return values from the related functions. The alarm timer workarounds will be cleaned up in a separate step.
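The resulting user-visible semantics can be illustrated with a small test program (hypothetical, not part of this series; link with -lrt on older glibc): on a kernel with these changes, a periodic timer whose signal is ignored stays dormant instead of expiring pointlessly, and delivery resumes once a real handler is installed again:

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void handler(int sig) { (void)sig; fired++; }

int main(void)
{
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
	struct itimerspec its = {
		.it_value = { .tv_nsec = 100 * 1000 * 1000 },
		.it_interval = { .tv_nsec = 100 * 1000 * 1000 },	/* 100ms periodic */
	};
	timer_t tid;

	signal(SIGRTMIN, SIG_IGN);	/* timer signal is ignored */
	timer_create(CLOCK_MONOTONIC, &sev, &tid);
	timer_settime(tid, 0, &its, NULL);

	sleep(1);			/* expiries occur, but nothing is delivered */

	signal(SIGRTMIN, handler);	/* lift SIG_IGN: the signal is requeued and the timer rearmed */
	sleep(1);

	printf("deliveries after un-ignore: %d\n", (int)fired);
	timer_delete(tid);
	return 0;
}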
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.187239060@linutronix.de --- include/linux/posix-timers.h | 2 +- kernel/signal.c | 7 ++-- kernel/time/alarmtimer.c | 47 +++++----------------- kernel/time/posix-cpu-timers.c | 18 ++------- kernel/time/posix-timers.c | 73 +++------------------------------- kernel/time/posix-timers.h | 2 +- 6 files changed, 24 insertions(+), 125 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 43ea6e784a257a..f11f10c97bd9dc 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -111,7 +111,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); -int posixtimer_send_sigqueue(struct k_itimer *tmr); +void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); diff --git a/kernel/signal.c b/kernel/signal.c index 9b098a7a206f39..cbf70c8089699f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1970,7 +1970,7 @@ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) return t; } -int posixtimer_send_sigqueue(struct k_itimer *tmr) +void posixtimer_send_sigqueue(struct k_itimer *tmr) { struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; @@ -1982,10 +1982,10 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) t = posixtimer_get_target(tmr); if (!t) - return -1; + return; if (!likely(lock_task_sighand(t, &flags))) - return -1; + return; /* * Update @tmr::sigqueue_seq for posix timer signals with sighand @@ -2054,7 +2054,6 @@ int posixtimer_send_sigqueue(struct k_itimer *tmr) out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); - return 0; } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 452d8aa2f6e013..8543d7f1cdb459 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -197,28 +197,15 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - int ret = HRTIMER_NORESTART; - int restart = ALARMTIMER_NORESTART; - spin_lock_irqsave(&base->lock, flags); - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard (spinlock_irqsave, &base->lock) + alarmtimer_dequeue(base, alarm); if (alarm->function) - restart = alarm->function(alarm, base->get_ktime()); - - spin_lock_irqsave(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - alarmtimer_enqueue(base, alarm); - ret = HRTIMER_RESTART; - } - spin_unlock_irqrestore(&base->lock, flags); + alarm->function(alarm, base->get_ktime()); trace_alarmtimer_fired(alarm, base->get_ktime()); - return ret; - + return HRTIMER_NORESTART; } ktime_t alarm_expires_remaining(const struct alarm *alarm) @@ -567,30 +554,14 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Return: whether the timer is to be restarted */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, - ktime_t now) +static enum 
alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { - struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); - enum alarmtimer_restart result = ALARMTIMER_NORESTART; - unsigned long flags; - - spin_lock_irqsave(&ptr->it_lock, flags); + struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_queue_signal(ptr) && ptr->it_interval) { - /* - * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. Ensure that - * small intervals cannot starve the system. - */ - ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); - ++ptr->it_signal_seq; - ptr->it_status = POSIX_TIMER_ARMED; - result = ALARMTIMER_RESTART; - } - spin_unlock_irqrestore(&ptr->it_lock, flags); + guard(spinlock_irqsave)(&ptr->it_lock); + posix_timer_queue_signal(ptr); - return result; + return ALARMTIMER_NORESTART; } /** diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0c441d8c26047b..50e8d04ab661f4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -603,21 +603,11 @@ static void cpu_timer_fire(struct k_itimer *timer) */ wake_up_process(timer->it_process); cpu_timer_setexpires(ctmr, 0); - } else if (!timer->it_interval) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ + } else { posix_timer_queue_signal(timer); - cpu_timer_setexpires(ctmr, 0); - } else if (posix_timer_queue_signal(timer)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. - */ - posix_cpu_timer_rearm(timer); - ++timer->it_signal_seq; + /* Disable oneshot timers */ + if (!timer->it_interval) + cpu_timer_setexpires(ctmr, 0); } } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ea72db3c93650d..881a9ce96af77a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -300,21 +300,12 @@ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *tim return ret; } -int posix_timer_queue_signal(struct k_itimer *timr) +void posix_timer_queue_signal(struct k_itimer *timr) { - enum posix_timer_state state = POSIX_TIMER_DISARMED; - int ret; - lockdep_assert_held(&timr->it_lock); - if (timr->it_interval) - state = POSIX_TIMER_REQUEUE_PENDING; - - timr->it_status = state; - - ret = posixtimer_send_sigqueue(timr); - /* If we failed to send the signal the timer stops. */ - return ret > 0; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; + posixtimer_send_sigqueue(timr); } /* @@ -327,62 +318,10 @@ int posix_timer_queue_signal(struct k_itimer *timr) static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); - enum hrtimer_restart ret = HRTIMER_NORESTART; - unsigned long flags; - - spin_lock_irqsave(&timr->it_lock, flags); - - if (posix_timer_queue_signal(timr)) { - /* - * The signal was not queued due to SIG_IGN. As a - * consequence the timer is not going to be rearmed from - * the signal delivery path. But as a real signal handler - * can be installed later the timer must be rearmed here. - */ - if (timr->it_interval != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. 
This is a non trivial - * change to the signal handling code. - * - * For now let timers with an interval less than a - * jiffy expire every jiffy and recheck for a - * valid signal handler. - * - * This avoids interrupt starvation in case of a - * very small interval, which would expire the - * timer immediately again. - * - * Moving now ahead of time by one jiffy tricks - * hrtimer_forward() to expire the timer later, - * while it still maintains the overrun accuracy - * for the price of a slight inconsistency in the - * timer_gettime() case. This is at least better - * than a timer storm. - * - * Only required when high resolution timers are - * enabled as the periodic tick based timers are - * automatically aligned to the next tick. - */ - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { - ktime_t kj = TICK_NSEC; - - if (timr->it_interval < kj) - now = ktime_add(now, kj); - } - - timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); - ret = HRTIMER_RESTART; - ++timr->it_signal_seq; - timr->it_status = POSIX_TIMER_ARMED; - } - } - unlock_timer(timr, flags); - return ret; + guard(spinlock_irqsave)(&timr->it_lock); + posix_timer_queue_signal(timr); + return HRTIMER_NORESTART; } static struct pid *good_sigevent(sigevent_t * event) diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 4d09677e584edd..61906f0688c1b8 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -42,7 +42,7 @@ extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; -int posix_timer_queue_signal(struct k_itimer *timr); +void posix_timer_queue_signal(struct k_itimer *timr); void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); int common_timer_set(struct k_itimer *timr, int flags, From 6b0aa145786dab25c6b8e79ad70ac3382c381596 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:56 +0100 Subject: [PATCH 110/140] alarmtimers: Remove the throttle mechanism from alarm_forward_now() Now that ignored posix timer signals are requeued and the timers are rearmed on signal delivery the workaround to keep such timers alive and self-rearm them is no longer required. Remove the unused alarm timer parts. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.252443020@linutronix.de --- kernel/time/alarmtimer.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8543d7f1cdb459..593e7d561fa867 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -467,35 +467,11 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - ktime_t now = base->get_ktime(); - - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { - /* - * Same issue as with posix_timer_fn(). Timers which are - * periodic but the signal is ignored can starve the system - * with a very small interval. The real fix which was - * promised in the context of posix_timer_fn() never - * materialized, but someone should really work on it.
- * - * To prevent DOS fake @now to be 1 jiffy out which keeps - * the overrun accounting correct but creates an - * inconsistency vs. timer_gettime(2). - */ - ktime_t kj = NSEC_PER_SEC / HZ; - - if (interval < kj) - now = ktime_add(now, kj); - } - - return alarm_forward(alarm, now, interval); -} -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) -{ - return __alarm_forward_now(alarm, interval, false); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); From 2634303f8773b0c602069887565cd412440be15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:58 +0100 Subject: [PATCH 111/140] alarmtimers: Remove return value from alarm functions Now that the SIG_IGN problem is solved in the core code, the alarmtimer callbacks do not require a return value anymore. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064214.318837272@linutronix.de --- drivers/power/supply/charger-manager.c | 3 +-- fs/timerfd.c | 4 +--- include/linux/alarmtimer.h | 10 ++-------- kernel/time/alarmtimer.c | 16 +++++----------- net/netfilter/xt_IDLETIMER.c | 4 +--- 5 files changed, 10 insertions(+), 27 deletions(-) diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c index 96f0a7fbf10518..09ec0ecf148639 100644 --- a/drivers/power/supply/charger-manager.c +++ b/drivers/power/supply/charger-manager.c @@ -1412,10 +1412,9 @@ static inline struct charger_desc *cm_get_drv_data(struct platform_device *pdev) return dev_get_platdata(&pdev->dev); } -static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now) +static void cm_timer_func(struct alarm *alarm, ktime_t now) { cm_timer_set = false; - return ALARMTIMER_NORESTART; } static int charger_manager_probe(struct platform_device *pdev) diff --git a/fs/timerfd.c b/fs/timerfd.c index 137523e0bb2113..f10c99ad5c6009 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -79,13 +79,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; } -static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, - ktime_t now) +static void timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); - return ALARMTIMER_NORESTART; } /* diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 05e758b8b89454..3ffa5341dce214 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -20,12 +20,6 @@ enum alarmtimer_type { ALARM_BOOTTIME_FREEZER, }; -enum alarmtimer_restart { - ALARMTIMER_NORESTART, - ALARMTIMER_RESTART, -}; - - #define ALARMTIMER_STATE_INACTIVE 0x00 #define ALARMTIMER_STATE_ENQUEUED 0x01 @@ -42,14 +36,14 @@ enum alarmtimer_restart { struct alarm { struct timerqueue_node node; struct hrtimer timer; - enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); + void (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; int state; void *data; }; void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); + void (*function)(struct alarm *, ktime_t)); void alarm_start(struct alarm *alarm, ktime_t start); void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 593e7d561fa867..37d2d79daea491 100644 
--- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -321,7 +321,7 @@ static int alarmtimer_resume(struct device *dev) static void __alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); alarm->timer.function = alarmtimer_fired; @@ -337,7 +337,7 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); @@ -530,14 +530,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Return: whether the timer is to be restarted */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) +static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); guard(spinlock_irqsave)(&ptr->it_lock); posix_timer_queue_signal(ptr); - - return ALARMTIMER_NORESTART; } /** @@ -698,18 +696,14 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer - * - * Return: ALARMTIMER_NORESTART */ -static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, - ktime_t now) +static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { struct task_struct *task = alarm->data; alarm->data = NULL; if (task) wake_up_process(task); - return ALARMTIMER_NORESTART; } /** @@ -761,7 +755,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index db720efa811d58..5514600586a91d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -107,14 +107,12 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } -static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, - ktime_t now) +static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); - return ALARMTIMER_NORESTART; } static int idletimer_check_sysfs_name(const char *name, unsigned int size) From fbf920f255315974808ce91d934fe50198294d51 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:15 +0100 Subject: [PATCH 112/140] hrtimers: Add missing hrtimer_init() trace points hrtimer_init*_on_stack() is not covered by tracing when CONFIG_DEBUG_OBJECTS_TIMERS=y. Rework the functions similar to hrtimer_init() and hrtimer_init_sleeper() so that the hrtimer_init() tracepoint is unconditionally available. The rework makes hrtimer_init_sleeper() unused. Delete it. 
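The shape of the rework can be sketched in plain C (a user-space analogue with hypothetical names, not the kernel implementation): the trace hook sits on the unconditional path, while the debug-object hook compiles to a no-op when the debug config is off, so tracing coverage no longer depends on it:

#include <stdio.h>

struct timer { int id; };

#ifdef DEBUG_OBJECTS_TIMERS
static void debug_init_on_stack(struct timer *t) { printf("debug: tracking %d\n", t->id); }
#else
static void debug_init_on_stack(struct timer *t) { (void)t; }
#endif

static void trace_timer_init(struct timer *t) { printf("trace: init %d\n", t->id); }

/* Single init path: the tracepoint fires regardless of the debug config */
static void timer_init_on_stack(struct timer *t)
{
	debug_init_on_stack(t);
	trace_timer_init(t);
}

int main(void)
{
	struct timer t = { .id = 1 };

	timer_init_on_stack(&t);
	return 0;
}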
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/74528e8abf2bb96e8bee85ffacbf14e15cf89f0d.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 19 +----------- kernel/time/hrtimer.c | 65 +++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index aa1e65ccb61584..5aa9d57528c4b6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -228,32 +228,15 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, - enum hrtimer_mode mode); - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else -static inline void hrtimer_init_on_stack(struct hrtimer *timer, - clockid_t which_clock, - enum hrtimer_mode mode) -{ - hrtimer_init(timer, which_clock, mode); -} - -static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, - enum hrtimer_mode mode) -{ - hrtimer_init_sleeper(sl, clock_id, mode); -} - static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 04f7d8a392c397..4b0507cf38ea07 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -417,6 +417,11 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); +} + static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { @@ -428,28 +433,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); - -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode); - -void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) -{ - debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); - __hrtimer_init_sleeper(sl, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); - void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -459,6 +442,7 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } @@ -472,6 +456,13 @@ debug_init(struct hrtimer *timer, clockid_t 
clockid, trace_hrtimer_init(timer, clockid, mode); } +static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init_on_stack(timer); + trace_hrtimer_init(timer, clockid, mode); +} + static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { @@ -1600,6 +1591,23 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); +/** + * hrtimer_init_on_stack - initialize a timer in stack memory + * @timer: The timer to be initialized + * @clock_id: The clock to be used + * @mode: The timer mode + * + * Similar to hrtimer_init(), except that this one must be used if struct hrtimer is in stack + * memory. + */ +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init_on_stack(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated @@ -1944,7 +1952,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -1987,19 +1995,18 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, } /** - * hrtimer_init_sleeper - initialize sleeper to the given clock + * hrtimer_init_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, - enum hrtimer_mode mode) +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(&sl->timer, clock_id, mode); + debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); - } -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { From 482a483cfe5bafeb5408532321cd607bae127a2b Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:16 +0100 Subject: [PATCH 113/140] drm/i915/request: Remove unnecessary modification of hrtimer::function When a request is created, the hrtimer is not initialized and only its 'function' field is set to NULL. The hrtimer is only initialized when the request is enqueued. The point of setting 'function' to NULL is that it can be used to check whether hrtimer_try_to_cancel() should be called while retiring the request. This "trick" is unnecessary, because hrtimer_try_to_cancel() already does its own check whether the timer is armed. If the timer is not armed, hrtimer_try_to_cancel() returns 0. Fully initialize the timer when the request is created, which allows making the hrtimer::function field private once all users of hrtimer_init() are converted to hrtimer_setup(), which requires a valid callback function to be set.
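A minimal sketch of the pattern being removed and its replacement (user-space analogue, hypothetical names): instead of abusing a NULL callback as an "armed" sentinel, the cancel primitive's return value carries that information:

#include <stdbool.h>
#include <stdio.h>

struct timer {
	bool armed;
	void (*function)(struct timer *);
};

static void expired(struct timer *t) { (void)t; }

/* Returns > 0 only if the timer was actually armed, mirroring hrtimer_try_to_cancel() */
static int timer_try_to_cancel(struct timer *t)
{
	if (!t->armed)
		return 0;
	t->armed = false;
	return 1;
}

int main(void)
{
	/* Fully initialized at creation time, never armed */
	struct timer t = { .armed = false, .function = expired };

	/* No "t.function != NULL" check needed before cancelling */
	if (timer_try_to_cancel(&t) > 0)
		printf("cancelled, drop the reference\n");
	else
		printf("nothing to cancel\n");
	return 0;
}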
Because hrtimer_try_to_cancel() returns 0 if the timer is not armed, the logic to check whether to call i915_request_put() remains equivalent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/50f865045aa672a9730343ad131543da332b1d8d.1730386209.git.namcao@linutronix.de --- drivers/gpu/drm/i915/i915_request.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 519e096c607cd8..8f62cfa23fb768 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -273,11 +273,6 @@ i915_request_active_engine(struct i915_request *rq, return ret; } -static void __rq_init_watchdog(struct i915_request *rq) -{ - rq->watchdog.timer.function = NULL; -} - static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) { struct i915_request *rq = @@ -294,6 +289,14 @@ static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer) return HRTIMER_NORESTART; } +static void __rq_init_watchdog(struct i915_request *rq) +{ + struct i915_request_watchdog *wdg = &rq->watchdog; + + hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wdg->timer.function = __rq_watchdog_expired; +} + static void __rq_arm_watchdog(struct i915_request *rq) { struct i915_request_watchdog *wdg = &rq->watchdog; @@ -304,8 +307,6 @@ static void __rq_arm_watchdog(struct i915_request *rq) i915_request_get(rq); - hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wdg->timer.function = __rq_watchdog_expired; hrtimer_start_range_ns(&wdg->timer, ns_to_ktime(ce->watchdog.timeout_us * NSEC_PER_USEC), @@ -317,7 +318,7 @@ static void __rq_cancel_watchdog(struct i915_request *rq) { struct i915_request_watchdog *wdg = &rq->watchdog; - if (wdg->timer.function && hrtimer_try_to_cancel(&wdg->timer) > 0) + if (hrtimer_try_to_cancel(&wdg->timer) > 0) i915_request_put(rq); } From f6e12766c52dc8e7032fe51d4ef33320b475775e Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:17 +0100 Subject: [PATCH 114/140] KVM: x86/xen: Initialize hrtimer in kvm_xen_init_vcpu() The hrtimer is initialized in the KVM_XEN_VCPU_SET_ATTR ioctl. That caused problems in the past, because the hrtimer can be initialized multiple times, which was fixed by commit af735db31285 ("KVM: x86/xen: Initialize Xen timer only once"). This commit avoids initializing the timer multiple times by checking the field 'function' of struct hrtimer to determine if it has already been initialized. This is not required and stands in the way of making the function field private. Move the hrtimer initialization into kvm_xen_init_vcpu() so that it will only be initialized once.
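The change follows the usual init-once pattern, roughly as in this hypothetical user-space sketch: one-time setup moves into the constructor path, so the repeatable operation no longer needs an "already initialized?" guard:

#include <stdio.h>

struct vcpu { int timer_ready; };

static void timer_init(struct vcpu *v) { v->timer_ready = 1; }

/* Runs exactly once per vcpu lifetime: initialize the timer here */
static void vcpu_init(struct vcpu *v)
{
	timer_init(v);
}

/* Repeatable ioctl path: no "if (!initialized) init()" dance anymore */
static void vcpu_set_timer_attr(struct vcpu *v, int virq)
{
	/* stop/restart the timer, change the vector, ... */
	printf("set virq %d (timer_ready=%d)\n", virq, v->timer_ready);
}

int main(void)
{
	struct vcpu v = { 0 };

	vcpu_init(&v);
	vcpu_set_timer_attr(&v, 10);
	vcpu_set_timer_attr(&v, 11);	/* safe: no re-initialization */
	return 0;
}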
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Acked-by: Sean Christopherson Link: https://lore.kernel.org/all/9c33c7224d97d08f4fa30d3cc8687981c1d3e953.1730386209.git.namcao@linutronix.de --- arch/x86/kvm/xen.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 622fe24da91064..a909b817b9c0da 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -263,13 +263,6 @@ static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.xen.timer_pending, 0); } -static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) -{ - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; -} - static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -1070,9 +1063,6 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - if (!vcpu->arch.xen.timer.function) - kvm_xen_init_timer(vcpu); - /* Stop the timer (if it's running) before changing the vector */ kvm_xen_stop_timer(vcpu); vcpu->arch.xen.timer_virq = data->u.timer.port; @@ -2235,6 +2225,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + vcpu->arch.xen.timer.function = xen_timer_callback; kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); From 48baf9fa4884e5ccf6ef8fa7099693696ebc6975 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:18 +0100 Subject: [PATCH 115/140] wifi: rt2x00: Remove redundant hrtimer_init() rt2x00usb_probe() executes a hrtimer_init() for txstatus_timer. Afterwards, rt2x00lib_probe_dev() is called which also initializes this txstatus_timer with the same settings. Remove the redundant hrtimer_init() call in rt2x00usb_probe(). Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Acked-by: Kalle Valo Link: https://lore.kernel.org/all/66116057f788e18a6603d50a554417eee459e02c.1730386209.git.namcao@linutronix.de --- drivers/net/wireless/ralink/rt2x00/rt2x00usb.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c b/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c index 8fd22c69855fa4..a6d50149e0c3eb 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c @@ -823,8 +823,6 @@ int rt2x00usb_probe(struct usb_interface *usb_intf, INIT_WORK(&rt2x00dev->rxdone_work, rt2x00usb_work_rxdone); INIT_WORK(&rt2x00dev->txdone_work, rt2x00usb_work_txdone); - hrtimer_init(&rt2x00dev->txstatus_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); retval = rt2x00usb_alloc_reg(rt2x00dev); if (retval) From c95d36585b9f8c43a4c5d5a9fe22477a138b63f4 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:19 +0100 Subject: [PATCH 116/140] io_uring: Remove redundant hrtimer's callback function setup The IORING_OP_TIMEOUT command uses hrtimer underneath. The timer's callback function is set up in io_timeout(), and then the callback function is set up again when the timer is rearmed. Since the callback function is the same for both cases, the latter setup is redundant, therefore remove it.
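The redundancy falls out naturally once the callback is bound at creation time; roughly (a hypothetical sketch, not the io_uring code), rearming only restarts the timer and never reassigns the function pointer:

#include <stdbool.h>
#include <stdio.h>

struct timer {
	bool armed;
	void (*function)(struct timer *);
};

static void on_timeout(struct timer *t) { (void)t; printf("timeout\n"); }

/* The callback is bound exactly once, at setup time */
static void timer_setup(struct timer *t, void (*fn)(struct timer *))
{
	t->armed = false;
	t->function = fn;
}

static void timer_start(struct timer *t) { t->armed = true; }

/* Rearm path: no repeated "t->function = on_timeout;" assignment */
static void timer_rearm(struct timer *t) { timer_start(t); }

int main(void)
{
	struct timer t;

	timer_setup(&t, on_timeout);
	timer_start(&t);
	t.function(&t);		/* simulate first expiry */
	timer_rearm(&t);
	t.function(&t);		/* simulate expiry after rearm */
	return 0;
}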
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Reviewed-by: Jens Axboe spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); return; From 908a1d775422ba2e27a5e33d0c130b522419e121 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:20 +0100 Subject: [PATCH 117/140] hrtimers: Introduce hrtimer_setup() to replace hrtimer_init() To initialize hrtimer, hrtimer_init() needs to be called and also hrtimer::function must be set. This is error-prone and awkward to use. Introduce hrtimer_setup() which does both of these things, so that users of hrtimer can be simplified. The new setup function also has a sanity check for the provided function pointer. If NULL, a warning is emitted and a dummy callback installed. hrtimer_init() will be removed as soon as all of its users have been converted to the new function. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/5057c1ddbfd4b92033cd93d37fe38e6b069d5ba6.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 ++ kernel/time/hrtimer.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5aa9d57528c4b6..bcc0715c59a841 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -228,6 +228,8 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4b0507cf38ea07..a5ef67edcda92f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1535,6 +1535,11 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return HRTIMER_BASE_MONOTONIC; } +static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { @@ -1571,6 +1576,18 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timerqueue_init(&timer->node); } +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + __hrtimer_init(timer, clock_id, mode); + + if (WARN_ON_ONCE(!function)) + timer->function = hrtimer_dummy_timeout; + else + timer->function = function; +} + /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized @@ -1591,6 +1608,27 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); +/** + * hrtimer_setup - initialize a timer to the given clock + * @timer: the timer to be initialized + * @function: the callback function + * @clock_id: the clock to be used + * @mode: The modes which are relevant for initialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored
as pinning happens + * when the hrtimer is started + */ +void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup); + /** * hrtimer_init_on_stack - initialize a timer in stack memory * @timer: The timer to be initialized From 444cb7db4c9f9b5d96be17c38b3e989df7bfabd5 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:21 +0100 Subject: [PATCH 118/140] hrtimers: Introduce hrtimer_setup_on_stack() To initialize hrtimer on stack, hrtimer_init_on_stack() needs to be called and also hrtimer::function must be set. This is error-prone and awkward to use. Introduce hrtimer_setup_on_stack() which does both of these things, so that users of hrtimer can be simplified. The new setup function also has a sanity check for the provided function pointer. If NULL, a warning is emitted and a dummy callback installed. hrtimer_init_on_stack() will be removed as soon as all of its users have been converted to the new function. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/4b05e2ab3a82c517adf67fabc0f0cd8fe118b97c.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 3 +++ kernel/time/hrtimer.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bcc0715c59a841..2da513f8d66acc 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -232,6 +232,9 @@ extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a5ef67edcda92f..daee4e27f83950 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1646,6 +1646,25 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); +/** + * hrtimer_setup_on_stack - initialize a timer on stack memory + * @timer: The timer to be initialized + * @function: the callback function + * @clock_id: The clock to be used + * @mode: The timer mode + * + * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack + * memory. + */ +void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_init_on_stack(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); + /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated From c9bd83abfeb9a9b103e689b251ccff7a01be8366 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:22 +0100 Subject: [PATCH 119/140] hrtimers: Introduce hrtimer_setup_sleeper_on_stack() The hrtimer_init*() API is replaced by hrtimer_setup*() variants to initialize the timer including the callback function at once. 
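For illustration only (a hypothetical 'foo' driver, not part of this series), the general conversion pattern for the hrtimer_setup*() variants looks like:

	/* Old: two-step initialization */
	hrtimer_init(&foo->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	foo->timer.function = foo_timer_fn;

	/* New: one call initializes the timer and installs the callback */
	hrtimer_setup(&foo->timer, foo_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);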
hrtimer_init_sleeper_on_stack() does not require the user to set up the callback function separately, so a new variant would not be strictly necessary. Nonetheless, to keep the naming convention consistent, introduce hrtimer_setup_sleeper_on_stack(). hrtimer_init_sleeper_on_stack() will be removed once all users are converted.

Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/7b5e18e6dd0ace9eaa211201528cb9dc23752454.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h | 2 ++
 kernel/time/hrtimer.c | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2da513f8d66acc..48872a2b40718c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -238,6 +238,8 @@ extern void hrtimer_setup_on_stack(struct hrtimer *timer,
 extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 					  clockid_t clock_id,
 					  enum hrtimer_mode mode);
+extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+					   enum hrtimer_mode mode);
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index daee4e27f83950..1d1f5c03673cef 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2065,6 +2065,20 @@ void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
 
+/**
+ * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
+ * @sl: sleeper to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
+				    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	debug_init_on_stack(&sl->timer, clock_id, mode);
+	__hrtimer_init_sleeper(sl, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
+
 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
 	switch(restart->nanosleep.type) {

From 8f02e3563bb5824eb01c94f2c75f1dcee2d05625 Mon Sep 17 00:00:00 2001
From: Nam Cao
Date: Thu, 31 Oct 2024 16:14:23 +0100
Subject: [PATCH 120/140] hrtimers: Introduce hrtimer_update_function()

Some users of hrtimer need to change the callback function after the initial setup. They write to hrtimer::function directly. That is not safe under all circumstances, as the write is lockless and a concurrent timer expiry might end up using the wrong function pointer.

Introduce hrtimer_update_function(), which also performs runtime checks on whether it is safe to modify the callback.

This allows hrtimer::function to be made private once all users are converted.

Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20a937b0ae09ad54b5b6d86eabead7c570f1b72e.1730386209.git.namcao@linutronix.de
---
 include/linux/hrtimer.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 48872a2b40718c..6e026730e80392 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -327,6 +327,28 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
 	return timer->base->running == timer;
 }
 
+/**
+ * hrtimer_update_function - Update the timer's callback function
+ * @timer: Timer to update
+ * @function: New callback function
+ *
+ * Only safe to call if the timer is not enqueued.
Can be called in the callback function if the + * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED). + */ +static inline void hrtimer_update_function(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *)) +{ + guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); + + if (WARN_ON_ONCE(hrtimer_is_queued(timer))) + return; + + if (WARN_ON_ONCE(!function)) + return; + + timer->function = function; +} + /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); From 28e70352b8069fcebf18466a780e2469f968ea98 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:24 +0100 Subject: [PATCH 121/140] fs/aio: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. The conversion was done with Coccinelle. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/5f10c259fa43ba2fe774de5b2cedc22f5e9cfd2d.1730386209.git.namcao@linutronix.de --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index e8920178b50f73..a5d331f2994322 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1335,7 +1335,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, if (until == 0 || ret < 0 || ret >= min_nr) return ret; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (until != KTIME_MAX) { hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); From 9788c1f0ff120476f58ad53e18098af8249d7e36 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:25 +0100 Subject: [PATCH 122/140] futex: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. The conversion was done with Coccinelle. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/d92116a17313dee283ebc959869bea80fbf94cdb.1730386209.git.namcao@linutronix.de --- kernel/futex/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 136768ae26375f..fb7214c7a36f73 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -140,9 +140,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, if (!time) return NULL; - hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(timeout, + (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). From eb688451dcfb7de0fef678a476096d3616228815 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:26 +0100 Subject: [PATCH 123/140] net: pktgen: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. The conversion was done with Coccinelle. 
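As a sketch of the resulting call pattern (illustration only; the actual call site is in the diff below):

	struct hrtimer_sleeper t;

	hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&t.timer, spin_until);
	/* ... start the sleeper and wait for it to expire ... */
	destroy_hrtimer_on_stack(&t.timer);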
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/c4b40b8fef250b6a325e1b8bd6057005fb3cb660.1730386209.git.namcao@linutronix.de --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34f68ef74b8f2c..7e23cacbe66e4a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2285,7 +2285,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); From 8fae141107d4540a153efa0e2751a6fc12a13679 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:27 +0100 Subject: [PATCH 124/140] timers: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage sites over to it. The conversion was done with Coccinelle. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/299c07f0f96af8ab3a7631b47b6ca22b06b20577.1730386209.git.namcao@linutronix.de --- kernel/time/hrtimer.c | 5 ++--- kernel/time/sleep_timeout.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1d1f5c03673cef..69430467a17dbd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2138,8 +2138,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); @@ -2153,7 +2152,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct hrtimer_sleeper t; int ret = 0; - hrtimer_init_sleeper_on_stack(&t, clockid, mode); + hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index 3054e5232d208f..dfe939f6e4ecf2 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -208,7 +208,7 @@ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); + hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); From 211647e5121e0e0da974bf69a8eb7c9fe57fa3bd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:28 +0100 Subject: [PATCH 125/140] wait: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. 
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/fc91182375df81120a88dbe0263267e24d1bf19e.1730386209.git.namcao@linutronix.de --- include/linux/wait.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 8aa3372f21a080..643b7c7bf376a4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -541,8 +541,8 @@ do { \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ - hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ - HRTIMER_MODE_REL); \ + hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ + HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ From f3bef7aaa6c807b78e8fc6929c3226d3038fe505 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:29 +0100 Subject: [PATCH 126/140] hrtimers: Delete hrtimer_init_sleeper_on_stack() hrtimer_init_sleeper_on_stack() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/52549846635c0b3a2abf82101f539efdabcd9778.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 3 --- kernel/time/hrtimer.c | 14 -------------- 2 files changed, 17 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 6e026730e80392..4e4f04b3c0c20c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -235,9 +235,6 @@ extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); -extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, - enum hrtimer_mode mode); extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 69430467a17dbd..376b8182b72e43 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2051,20 +2051,6 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, sl->task = current; } -/** - * hrtimer_init_sleeper_on_stack - initialize a sleeper in stack memory - * @sl: sleeper to be initialized - * @clock_id: the clock to be used - * @mode: timer mode abs/rel - */ -void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) -{ - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); - /** * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized From 46d076af6d640774a7a8bd6ebf130c22913d3bdb Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:30 +0100 Subject: [PATCH 127/140] sched/idle: Switch to use hrtimer_setup_on_stack() hrtimer_setup_on_stack() takes the callback function pointer as argument and initializes the timer completely. Replace hrtimer_init_on_stack() and the open coded initialization of hrtimer::function with the new setup mechanism. The conversion was done with Coccinelle. 
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/17f9421fed6061df4ad26a4cc91873d2c078cb0f.1730386209.git.namcao@linutronix.de --- kernel/sched/idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d2f096bb274c3f..631e428029259a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -399,8 +399,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED_HARD); From fc9f59de26afb3b4a33d37f1ba51a441b050afbb Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:31 +0100 Subject: [PATCH 128/140] io_uring: Switch to use hrtimer_setup_on_stack() hrtimer_setup_on_stack() takes the callback function pointer as argument and initializes the timer completely. Replace hrtimer_init_on_stack() and the open coded initialization of hrtimer::function with the new setup mechanism. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/f0d4ac32ec4050710a656cee8385fa4427be33aa.1730386209.git.namcao@linutronix.de --- io_uring/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index feb61d68dca68e..0842aa3f60e7a5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2435,13 +2435,14 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, { ktime_t timeout; - hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); if (iowq->min_timeout) { timeout = ktime_add_ns(iowq->min_timeout, start_time); - iowq->t.function = io_cqring_min_timer_wakeup; + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); } else { timeout = iowq->timeout; - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); } hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); From d82fadc727501e80cbc733f5990a682c9f46dc5e Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:32 +0100 Subject: [PATCH 129/140] alarmtimer: Switch to use hrtimer_setup() and hrtimer_setup_on_stack() hrtimer_setup() and hrtimer_setup_on_stack() take the callback function pointer as argument and initialize the timer completely. Replace the hrtimer_init*() variants and the open coded initialization of hrtimer::function with the new setup mechanism. Switch to use the new functions. 
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/2bae912336103405adcdab96b88d3ea0353b4228.1730386209.git.namcao@linutronix.de --- kernel/time/alarmtimer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 37d2d79daea491..0ddccdff119a01 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -324,7 +324,6 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); - alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -339,8 +338,8 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, void alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { - hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); @@ -757,8 +756,8 @@ static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)) { - hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup_on_stack(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } From 3c2fb0152175f9f596b40763cdc1378297da60af Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:33 +0100 Subject: [PATCH 130/140] hrtimers: Delete hrtimer_init_on_stack() hrtimer_init_on_stack() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/510ce0d2944c4a382ea51e51d03dcfb73ba0f4f7.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 -- kernel/time/hrtimer.c | 17 ----------------- 2 files changed, 19 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4e4f04b3c0c20c..7ef5f7ef31a911 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -230,8 +230,6 @@ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); -extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, - enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 376b8182b72e43..55e9ffbcd49ad4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1629,23 +1629,6 @@ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struc } EXPORT_SYMBOL_GPL(hrtimer_setup); -/** - * hrtimer_init_on_stack - initialize a timer in stack memory - * @timer: The timer to be initialized - * @clock_id: The clock to be used - * @mode: The timer mode - * - * Similar to hrtimer_init(), except that this one must be used if struct hrtimer is in stack - * memory. 
- */
-void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
-			   enum hrtimer_mode mode)
-{
-	debug_init_on_stack(timer, clock_id, mode);
-	__hrtimer_init(timer, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
-
 /**
  * hrtimer_setup_on_stack - initialize a timer on stack memory

From 1d58f7f3a1373734b2e86a246edcf1cd39359f3e Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Fri, 25 Oct 2024 21:31:01 +0100
Subject: [PATCH 131/140] clocksource/drivers/dw_apb: Remove unused dw_apb_clockevent functions

dw_apb_clockevent_pause(), dw_apb_clockevent_resume() and dw_apb_clockevent_stop() have been unused since 2021's commit 1b79fc4f2bfd ("x86/apb_timer: Remove driver for deprecated platform"). Remove them.

(Some of the other clockevent functions are still called by dw_apb_timer_of.c, so the driver itself appears to still be in use.)

Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241025203101.241709-1-linux@treblig.org Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/dw_apb_timer.c | 39 ------------------------------
 include/linux/dw_apb_timer.h | 3 ---
 2 files changed, 42 deletions(-)

diff --git a/drivers/clocksource/dw_apb_timer.c b/drivers/clocksource/dw_apb_timer.c
index f5f24a95ee820f..3a55ae5fe225ac 100644
--- a/drivers/clocksource/dw_apb_timer.c
+++ b/drivers/clocksource/dw_apb_timer.c
@@ -68,25 +68,6 @@ static inline void apbt_writel_relaxed(struct dw_apb_timer *timer, u32 val,
 	writel_relaxed(val, timer->base + offs);
 }
 
-static void apbt_disable_int(struct dw_apb_timer *timer)
-{
-	u32 ctrl = apbt_readl(timer, APBTMR_N_CONTROL);
-
-	ctrl |= APBTMR_CONTROL_INT;
-	apbt_writel(timer, ctrl, APBTMR_N_CONTROL);
-}
-
-/**
- * dw_apb_clockevent_pause() - stop the clock_event_device from running
- *
- * @dw_ced: The APB clock to stop generating events.
- */
-void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced)
-{
-	disable_irq(dw_ced->timer.irq);
-	apbt_disable_int(&dw_ced->timer);
-}
-
 static void apbt_eoi(struct dw_apb_timer *timer)
 {
 	apbt_readl_relaxed(timer, APBTMR_N_EOI);
@@ -284,26 +265,6 @@ dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
 	return dw_ced;
 }
 
-/**
- * dw_apb_clockevent_resume() - resume a clock that has been paused.
- *
- * @dw_ced: The APB clock to resume.
- */
-void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced)
-{
-	enable_irq(dw_ced->timer.irq);
-}
-
-/**
- * dw_apb_clockevent_stop() - stop the clock_event_device and release the IRQ.
- *
- * @dw_ced: The APB clock to stop generating the events.
- */
-void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced)
-{
-	free_irq(dw_ced->timer.irq, &dw_ced->ced);
-}
-
 /**
  * dw_apb_clockevent_register() - register the clock with the generic layer
  *
diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h
index 82ebf92239480d..f8811c46b89e7e 100644
--- a/include/linux/dw_apb_timer.h
+++ b/include/linux/dw_apb_timer.h
@@ -34,9 +34,6 @@ struct dw_apb_clocksource {
 };
 
 void dw_apb_clockevent_register(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced);
-void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced);
 
 struct dw_apb_clock_event_device *
 dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,

From 0309f714a0908e947af1c902cf6a330cb593e75e Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Tue, 1 Oct 2024 12:23:56 +0100
Subject: [PATCH 132/140] clocksource/drivers/sp804: Make user selectable

The sp804 is currently only user selectable if COMPILE_TEST; this was done by commit dfc82faad725 ("clocksource/drivers/sp804: Add COMPILE_TEST to CONFIG_ARM_TIMER_SP804") in order to avoid it being spuriously offered on platforms that won't have the hardware, since it is generally only seen on Arm-based platforms.

This config is overly restrictive: while platforms that rely on the SP804 do select it in their Kconfig, there are others, such as the Arm fast models, which have an SP804 available but currently unused by Linux. Relax the dependency to allow it to be user selectable on arm and arm64 to avoid surprises, and in case someone comes up with a use for the extra timer hardware.

Fixes: dfc82faad725 ("clocksource/drivers/sp804: Add COMPILE_TEST to CONFIG_ARM_TIMER_SP804")
Reported-by: Ross Burton Reviewed-by: Sudeep Holla Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-vexpress-sp804-v3-1-0a2d3f7883e4@kernel.org Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 95dd4660b5b659..d546903dba4f3a 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -400,7 +400,8 @@ config ARM_GT_INITIAL_PRESCALER_VAL
 	  This affects CPU_FREQ max delta from the initial frequency.
 
 config ARM_TIMER_SP804
-	bool "Support for Dual Timer SP804 module" if COMPILE_TEST
+	bool "Support for Dual Timer SP804 module"
+	depends on ARM || ARM64 || COMPILE_TEST
 	depends on GENERIC_SCHED_CLOCK && HAVE_CLK
 	select CLKSRC_MMIO
 	select TIMER_OF if OF

From 314413317b6d78cc76cd48f0296fde9fcfdec400 Mon Sep 17 00:00:00 2001
From: Judith Mendez
Date: Fri, 11 Oct 2024 12:52:03 -0500
Subject: [PATCH 133/140] clocksource/drivers/timer-ti-dm: Don't fail probe if int not found

Some timers may not have an interrupt routed to the A53 GIC, but the timer PWM functionality can still be used by the Linux kernel. Therefore, do not fail probe if the interrupt is not found and the ti,timer-pwm property exists.
Signed-off-by: Judith Mendez Link: https://lore.kernel.org/r/20241011175203.1040568-1-jm@ti.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index b7a34b1a975ec5..3666d94cc8ddc5 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -1104,8 +1104,12 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return -ENOMEM; timer->irq = platform_get_irq(pdev, 0); - if (timer->irq < 0) - return timer->irq; + if (timer->irq < 0) { + if (of_property_read_bool(dev->of_node, "ti,timer-pwm")) + dev_info(dev, "Did not find timer interrupt, timer usable in PWM mode only\n"); + else + return timer->irq; + } timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) From dfe101bcad840d025deb5e43150d54050ab7724d Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 19 Oct 2024 09:10:30 +0200 Subject: [PATCH 134/140] clocksource/drivers/mips-gic-timer: Always use cluster 0 counter as clocksource In a multi-cluster MIPS system, there are multiple GICs - one in each cluster - each of which has its independent counter. The counters in each GIC are not synchronized in any way, so they can drift relative to one another through the lifetime of the system. This is problematic for a clock source which ought to be global. Avoid problems by always accessing cluster 0's counter, using cross-cluster register access. This adds overhead so it is applied only on multi-cluster systems. Signed-off-by: Paul Burton Signed-off-by: Chao-ying Fu Signed-off-by: Dragan Mladjenovic Signed-off-by: Aleksandar Rikalo Tested-by: Serge Semin Acked-by: Thomas Bogendoerfer Tested-by: Gregory CLEMENT Link: https://lore.kernel.org/r/20241019071037.145314-6-arikalo@gmail.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/mips-gic-timer.c | 39 +++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index 110347707ff980..7907b740497a5c 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -166,6 +166,37 @@ static u64 gic_hpt_read(struct clocksource *cs) return gic_read_count(); } +static u64 gic_hpt_read_multicluster(struct clocksource *cs) +{ + unsigned int hi, hi2, lo; + u64 count; + + mips_cm_lock_other(0, 0, 0, CM_GCR_Cx_OTHER_BLOCK_GLOBAL); + + if (mips_cm_is64) { + count = read_gic_redir_counter(); + goto out; + } + + hi = read_gic_redir_counter_32h(); + while (true) { + lo = read_gic_redir_counter_32l(); + + /* If hi didn't change then lo didn't wrap & we're done */ + hi2 = read_gic_redir_counter_32h(); + if (hi2 == hi) + break; + + /* Otherwise, repeat with the latest hi value */ + hi = hi2; + } + + count = (((u64)hi) << 32) + lo; +out: + mips_cm_unlock_other(); + return count; +} + static struct clocksource gic_clocksource = { .name = "GIC", .read = gic_hpt_read, @@ -203,6 +234,11 @@ static int __init __gic_clocksource_init(void) gic_clocksource.rating = 200; gic_clocksource.rating += clamp(gic_frequency / 10000000, 0, 99); + if (mips_cps_multicluster_cpus()) { + gic_clocksource.read = &gic_hpt_read_multicluster; + gic_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } + ret = clocksource_register_hz(&gic_clocksource, gic_frequency); if (ret < 0) pr_warn("Unable to register clocksource\n"); @@ -261,7 +297,8 @@ static int __init 
gic_clocksource_of_init(struct device_node *node)
	 * stable CPU frequency or on the platforms with CM3 and CPU frequency
	 * change performed by the CPC core clocks divider.
	 */
-	if (mips_cm_revision() >= CM_REV_CM3 || !IS_ENABLED(CONFIG_CPU_FREQ)) {
+	if ((mips_cm_revision() >= CM_REV_CM3 || !IS_ENABLED(CONFIG_CPU_FREQ)) &&
+	    !mips_cps_multicluster_cpus()) {
 		sched_clock_register(mips_cm_is64 ?
 				     gic_read_count_64 : gic_read_count_2x32,
 				     gic_count_width, gic_frequency);

From cd5375610baadd3a0842a9e83ca502684f938be8 Mon Sep 17 00:00:00 2001
From: Sergio Paracuellos
Date: Mon, 28 Oct 2024 21:36:43 +0100
Subject: [PATCH 135/140] clocksource/drivers/ralink: Add Ralink System Tick Counter driver

System Tick Counter is present on Ralink SoCs RT3352 and MT7620. This driver has been in the 'arch/mips/ralink' directory since the beginning of Ralink architecture support. However, it can be moved into a more appropriate place in 'drivers/clocksource'. Hence move it here, also adding support for compile-test targets and reducing LOC in the architecture code folder.

Signed-off-by: Sergio Paracuellos Link: https://lore.kernel.org/r/20241028203643.191268-2-sergio.paracuellos@gmail.com Signed-off-by: Daniel Lezcano
---
 arch/mips/ralink/Kconfig | 7 -------
 arch/mips/ralink/Makefile | 2 --
 drivers/clocksource/Kconfig | 9 +++++++++
 drivers/clocksource/Makefile | 1 +
 .../clocksource/timer-ralink.c | 11 ++++-------
 5 files changed, 14 insertions(+), 16 deletions(-)
 rename arch/mips/ralink/cevt-rt3352.c => drivers/clocksource/timer-ralink.c (91%)

diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig
index 08c012a2591f9d..910d059ec70b77 100644
--- a/arch/mips/ralink/Kconfig
+++ b/arch/mips/ralink/Kconfig
@@ -1,13 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 if RALINK
 
-config CLKEVT_RT3352
-	bool
-	depends on SOC_RT305X || SOC_MT7620
-	default y
-	select TIMER_OF
-	select CLKSRC_MMIO
-
 config RALINK_ILL_ACC
 	bool
 	depends on SOC_RT305X
diff --git a/arch/mips/ralink/Makefile b/arch/mips/ralink/Makefile
index 26fabbdea1f1e3..0c109eae195359 100644
--- a/arch/mips/ralink/Makefile
+++ b/arch/mips/ralink/Makefile
@@ -10,8 +10,6 @@ ifndef CONFIG_MIPS_GIC
 obj-y += clk.o timer.o
 endif
 
-obj-$(CONFIG_CLKEVT_RT3352) += cevt-rt3352.o
-
 obj-$(CONFIG_RALINK_ILL_ACC) += ill_acc.o
 
 obj-$(CONFIG_IRQ_INTC) += irq.o
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index d546903dba4f3a..487c8525996724 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -754,4 +754,13 @@ config EP93XX_TIMER
 	  Enables support for the Cirrus Logic timer block EP93XX.
 
+config RALINK_TIMER
+	bool "Ralink System Tick Counter"
+	depends on SOC_RT305X || SOC_MT7620 || COMPILE_TEST
+	select CLKSRC_MMIO
+	select TIMER_OF
+	help
+	  Enables support for system tick counter present on
+	  Ralink SoCs RT3352 and MT7620.
+ endmenu diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 22743785299eda..43ef16a4efa6a3 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -91,3 +91,4 @@ obj-$(CONFIG_GOLDFISH_TIMER) += timer-goldfish.o obj-$(CONFIG_GXP_TIMER) += timer-gxp.o obj-$(CONFIG_CLKSRC_LOONGSON1_PWM) += timer-loongson1-pwm.o obj-$(CONFIG_EP93XX_TIMER) += timer-ep93xx.o +obj-$(CONFIG_RALINK_TIMER) += timer-ralink.o diff --git a/arch/mips/ralink/cevt-rt3352.c b/drivers/clocksource/timer-ralink.c similarity index 91% rename from arch/mips/ralink/cevt-rt3352.c rename to drivers/clocksource/timer-ralink.c index 269d4877d120e8..6ecdb4228f7637 100644 --- a/arch/mips/ralink/cevt-rt3352.c +++ b/drivers/clocksource/timer-ralink.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. + * Ralink System Tick Counter driver present on RT3352 and MT7620 SoCs. * * Copyright (C) 2013 by John Crispin */ @@ -16,8 +15,6 @@ #include #include -#include - #define SYSTICK_FREQ (50 * 1000) #define SYSTICK_CONFIG 0x00 @@ -40,7 +37,7 @@ static int systick_set_oneshot(struct clock_event_device *evt); static int systick_shutdown(struct clock_event_device *evt); static int systick_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { struct systick_device *sdev; u32 count; @@ -60,7 +57,7 @@ static void systick_event_handler(struct clock_event_device *dev) static irqreturn_t systick_interrupt(int irq, void *dev_id) { - struct clock_event_device *dev = (struct clock_event_device *) dev_id; + struct clock_event_device *dev = (struct clock_event_device *)dev_id; dev->event_handler(dev); From ae4705e1b1bc4dedceb6b0956509e3eb2fedaaf1 Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 3 Nov 2024 14:35:11 +0200 Subject: [PATCH 136/140] dt-bindings: timer: actions,owl-timer: convert to YAML Convert the Actions Semi Owl timer bindings to DT schema. Changes during conversion: - Add a description - Add "clocks" as a required property, since the driver searches for it - Correct the given example according to owl-s500.dtsi Signed-off-by: Ivaylo Ivanov Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241103123513.2890107-1-ivo.ivanov.ivanov1@gmail.com Signed-off-by: Daniel Lezcano --- .../bindings/timer/actions,owl-timer.txt | 21 ---- .../bindings/timer/actions,owl-timer.yaml | 107 ++++++++++++++++++ MAINTAINERS | 2 +- 3 files changed, 108 insertions(+), 22 deletions(-) delete mode 100644 Documentation/devicetree/bindings/timer/actions,owl-timer.txt create mode 100644 Documentation/devicetree/bindings/timer/actions,owl-timer.yaml diff --git a/Documentation/devicetree/bindings/timer/actions,owl-timer.txt b/Documentation/devicetree/bindings/timer/actions,owl-timer.txt deleted file mode 100644 index 977054f87563ce..00000000000000 --- a/Documentation/devicetree/bindings/timer/actions,owl-timer.txt +++ /dev/null @@ -1,21 +0,0 @@ -Actions Semi Owl Timer - -Required properties: -- compatible : "actions,s500-timer" for S500 - "actions,s700-timer" for S700 - "actions,s900-timer" for S900 -- reg : Offset and length of the register set for the device. -- interrupts : Should contain the interrupts. 
-- interrupt-names : Valid names are: "2hz0", "2hz1", - "timer0", "timer1", "timer2", "timer3" - See ../resource-names.txt - -Example: - - timer@b0168000 { - compatible = "actions,s500-timer"; - reg = <0xb0168000 0x100>; - interrupts = , - ; - interrupt-names = "timer0", "timer1"; - }; diff --git a/Documentation/devicetree/bindings/timer/actions,owl-timer.yaml b/Documentation/devicetree/bindings/timer/actions,owl-timer.yaml new file mode 100644 index 00000000000000..646c554a390a3a --- /dev/null +++ b/Documentation/devicetree/bindings/timer/actions,owl-timer.yaml @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/actions,owl-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Actions Semi Owl timer + +maintainers: + - Andreas Färber + +description: + Actions Semi Owl SoCs provide 32bit and 2Hz timers. + The 32bit timers support dynamic irq, as well as one-shot mode. + +properties: + compatible: + enum: + - actions,s500-timer + - actions,s700-timer + - actions,s900-timer + + clocks: + maxItems: 1 + + interrupts: + minItems: 1 + maxItems: 6 + + interrupt-names: + minItems: 1 + maxItems: 6 + items: + enum: + - 2hz0 + - 2hz1 + - timer0 + - timer1 + - timer2 + - timer3 + + reg: + maxItems: 1 + +required: + - compatible + - clocks + - interrupts + - interrupt-names + - reg + +allOf: + - if: + properties: + compatible: + contains: + enum: + - actions,s500-timer + then: + properties: + interrupts: + minItems: 4 + maxItems: 4 + interrupt-names: + items: + - const: 2hz0 + - const: 2hz1 + - const: timer0 + - const: timer1 + + - if: + properties: + compatible: + contains: + enum: + - actions,s700-timer + - actions,s900-timer + then: + properties: + interrupts: + minItems: 1 + maxItems: 1 + interrupt-names: + items: + - const: timer1 + +additionalProperties: false + +examples: + - | + #include + #include + soc { + #address-cells = <1>; + #size-cells = <1>; + timer@b0168000 { + compatible = "actions,s500-timer"; + reg = <0xb0168000 0x100>; + clocks = <&hosc>; + interrupts = , + , + , + ; + interrupt-names = "2hz0", "2hz1", "timer0", "timer1"; + }; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index 2250eb10ece1d7..3a24287712f190 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2004,7 +2004,7 @@ F: Documentation/devicetree/bindings/mmc/owl-mmc.yaml F: Documentation/devicetree/bindings/net/actions,owl-emac.yaml F: Documentation/devicetree/bindings/pinctrl/actions,* F: Documentation/devicetree/bindings/power/actions,owl-sps.txt -F: Documentation/devicetree/bindings/timer/actions,owl-timer.txt +F: Documentation/devicetree/bindings/timer/actions,owl-timer.yaml F: arch/arm/boot/dts/actions/ F: arch/arm/mach-actions/ F: arch/arm64/boot/dts/actions/ From e5cfc0989d9a2849c51c720a16b90b2c061a1aeb Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 31 Oct 2024 13:54:23 +0100 Subject: [PATCH 137/140] clocksource/drivers/timer-ti-dm: Fix child node refcount handling of_find_compatible_node() increments the node's refcount, and it must be decremented again with a call to of_node_put() when the pointer is no longer required to avoid leaking the resource. Instead of adding the missing calls to of_node_put() in all execution paths, use the cleanup attribute for 'arm_timer' by means of the __free() macro, which automatically calls of_node_put() when the variable goes out of scope. 
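For illustration, the resulting pattern looks like this (hypothetical lookup; the compatible string and names are made up):

	struct device_node *np __free(device_node) =
		of_find_compatible_node(NULL, NULL, "vendor,example-timer");

	if (!of_device_is_available(np))
		return 0;

	/* use np ...; of_node_put() runs automatically when np goes out of scope */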
Fixes: 25de4ce5ed02 ("clocksource/drivers/timer-ti-dm: Handle dra7 timer wrap errata i940")
Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241031-timer-ti-dm-systimer-of_node_put-v3-1-063ee822b73a@gmail.com Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/timer-ti-dm-systimer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c
index c2dcd8d68e4587..d1c144d6f328cf 100644
--- a/drivers/clocksource/timer-ti-dm-systimer.c
+++ b/drivers/clocksource/timer-ti-dm-systimer.c
@@ -686,9 +686,9 @@ subsys_initcall(dmtimer_percpu_timer_startup);
 
 static int __init dmtimer_percpu_quirk_init(struct device_node *np, u32 pa)
 {
-	struct device_node *arm_timer;
+	struct device_node *arm_timer __free(device_node) =
+		of_find_compatible_node(NULL, NULL, "arm,armv7-timer");
 
-	arm_timer = of_find_compatible_node(NULL, NULL, "arm,armv7-timer");
 	if (of_device_is_available(arm_timer)) {
 		pr_warn_once("ARM architected timer wrap issue i940 detected\n");
 		return 0;

From 5569d7348b4a927eb5a2449ddc175ec7c3930c4d Mon Sep 17 00:00:00 2001
From: Tang Bin
Date: Thu, 7 Nov 2024 15:46:19 +0800
Subject: [PATCH 138/140] clocksource/drivers/gxp: Remove redundant casts

In the function gxp_timer_init(), the 'int' cast in front of the PTR_ERR() macro is redundant, so remove it.

Signed-off-by: Tang Bin Link: https://lore.kernel.org/r/20241107074619.2714-1-tangbin@cmss.chinamobile.com Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/timer-gxp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/timer-gxp.c b/drivers/clocksource/timer-gxp.c
index 57aa2e2cce53a5..48a73c101eb85a 100644
--- a/drivers/clocksource/timer-gxp.c
+++ b/drivers/clocksource/timer-gxp.c
@@ -85,7 +85,7 @@ static int __init gxp_timer_init(struct device_node *node)
 
 	clk = of_clk_get(node, 0);
 	if (IS_ERR(clk)) {
-		ret = (int)PTR_ERR(clk);
+		ret = PTR_ERR(clk);
 		pr_err("%pOFn clock not found: %d\n", node, ret);
 		goto err_free;
 	}

From 08b97fbd13de79744b31d2b3c8a0ab1a409b94fa Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)"
Date: Mon, 4 Nov 2024 13:05:06 -0600
Subject: [PATCH 139/140] clocksource/drivers/arm_arch_timer: Use of_property_present() for non-boolean properties

The use of of_property_read_bool() for non-boolean properties is deprecated in favor of of_property_present() when testing for property presence.
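A minimal sketch of the distinction (hypothetical properties on a node np):

	/* Boolean property: its mere presence encodes the value */
	bool secure = of_property_read_bool(np, "ti,timer-secure");

	/* Non-boolean property: only test whether it exists */
	if (!of_property_present(np, "assigned-clocks"))
		return false;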
Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241104190505.272805-2-robh@kernel.org Signed-off-by: Daniel Lezcano
---
 drivers/clocksource/arm_arch_timer.c | 2 +-
 drivers/clocksource/timer-ti-dm-systimer.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 2bba81e25aa2d8..808f259781fd2a 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1428,7 +1428,7 @@ static int __init arch_timer_of_init(struct device_node *np)
 
 	arch_timers_present |= ARCH_TIMER_TYPE_CP15;
 
-	has_names = of_property_read_bool(np, "interrupt-names");
+	has_names = of_property_present(np, "interrupt-names");
 
 	for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) {
 		if (has_names)
diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c
index d1c144d6f328cf..985a6d08512b42 100644
--- a/drivers/clocksource/timer-ti-dm-systimer.c
+++ b/drivers/clocksource/timer-ti-dm-systimer.c
@@ -202,10 +202,10 @@ static bool __init dmtimer_is_preferred(struct device_node *np)
 
 	/* Secure gptimer12 is always clocked with a fixed source */
 	if (!of_property_read_bool(np, "ti,timer-secure")) {
-		if (!of_property_read_bool(np, "assigned-clocks"))
+		if (!of_property_present(np, "assigned-clocks"))
 			return false;
 
-		if (!of_property_read_bool(np, "assigned-clock-parents"))
+		if (!of_property_present(np, "assigned-clock-parents"))
 			return false;
 	}

From cdc905d16b07981363e53a21853ba1cf6cd8e92a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 17 Nov 2024 00:48:23 +0100
Subject: [PATCH 140/140] posix-timers: Fix spurious warning on double enqueue versus do_exit()

A timer sigqueue may already be pending when an attempt is made to enqueue it. This situation can happen if the timer sigqueue is enqueued but the timer is then reset and fires before the pending signal has been delivered.

However, when such a double enqueue occurs while the corresponding signal is ignored, the sigqueue is expected to be found either on the dedicated ignored list (if the timer was periodic) or dropped (if the timer was one-shot). In any case, it is not supposed to be queued on the real signal queue.

An assertion verifies the latter expectation on top of the return value of prepare_signal(), assuming "false" means that the signal is being ignored. But prepare_signal() may also fail if the target is exiting as the last task of its group.
In this case the double enqueue observes the sigqueue queued, as in such a situation: TASK A (same group as B) TASK B (same group as A) ------------------------ ------------------------ // timer event // queue signal to TASK B posix_timer_queue_signal() // reset timer through syscall do_timer_settime() // exit, leaving task B alone do_exit() do_exit() synchronize_group_exit() signal->flags = SIGNAL_GROUP_EXIT // ========> timer event posix_timer_queue_signal() // return false due to SIGNAL_GROUP_EXIT if (!prepare_signal()) WARN_ON_ONCE(!list_empty(&q->list)) And this spuriously triggers this warning: WARNING: CPU: 0 PID: 5854 at kernel/signal.c:2008 posixtimer_send_sigqueue CPU: 0 UID: 0 PID: 5854 Comm: syz-executor139 Not tainted 6.12.0-rc6-next-20241108-syzkaller #0 RIP: 0010:posixtimer_send_sigqueue+0x9da/0xbc0 kernel/signal.c:2008 Call Trace: alarm_handle_timer alarmtimer_fired __run_hrtimer __hrtimer_run_queues hrtimer_interrupt local_apic_timer_interrupt __sysvec_apic_timer_interrupt instr_sysvec_apic_timer_interrupt sysvec_apic_timer_interrupt Fortunately the recovery code in that case already does the right thing: just exit from posixtimer_send_sigqueue() and wait for __exit_signal() to flush the pending signal. Just make sure to warn only the case when the sigqueue is queued and the signal is really ignored. Fixes: df7a996b4dab ("signal: Queue ignored posixtimers on ignore list") Reported-by: syzbot+852e935b899bde73626e@syzkaller.appspotmail.com Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: syzbot+852e935b899bde73626e@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/20241116234823.28497-1-frederic@kernel.org Closes: https://lore.kernel.org/all/673549c6.050a0220.1324f8.008c.GAE@google.com --- kernel/signal.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index cbf70c8089699f..10b464b9d91f6f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2003,9 +2003,15 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr) if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; - /* Paranoia check. Try to survive. */ - if (WARN_ON_ONCE(!list_empty(&q->list))) + if (!list_empty(&q->list)) { + /* + * If task group is exiting with the signal already pending, + * wait for __exit_signal() to do its job. Otherwise if + * ignored, it's not supposed to be queued. Try to survive. + */ + WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT)); goto out; + } /* Periodic timers with SIG_IGN are queued on the ignored list */ if (tmr->it_sig_periodic) {