From 7d26e2d5e2da37e92c6c7644b26b294dedd8c982 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 2 Jul 2009 17:08:30 -0700 Subject: [PATCH 1/6] [CPUFREQ] Eliminate the recent lockdep warnings in cpufreq Commit b14893a62c73af0eca414cfed505b8c09efc613c although it was very much needed to properly cleanup ondemand timer, opened-up a can of worms related to locking dependencies in cpufreq. Patch here defines the need for dbs_mutex and cleans up its usage in ondemand governor. This also resolves the lockdep warnings reported here http://lkml.indiana.edu/hypermail/linux/kernel/0906.1/01925.html http://lkml.indiana.edu/hypermail/linux/kernel/0907.0/00820.html and few others.. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 4 ++-- drivers/cpufreq/cpufreq_conservative.c | 27 +++++++++++--------------- drivers/cpufreq/cpufreq_ondemand.c | 27 +++++++++++--------------- 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6e2ec0b18948..c7fe16e0474b 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1070,8 +1070,6 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) spin_unlock_irqrestore(&cpufreq_driver_lock, flags); #endif - unlock_policy_rwsem_write(cpu); - if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); @@ -1088,6 +1086,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) if (cpufreq_driver->exit) cpufreq_driver->exit(data); + unlock_policy_rwsem_write(cpu); + free_cpumask_var(data->related_cpus); free_cpumask_var(data->cpus); kfree(data); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b4..58889f26029a 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -70,15 +70,10 @@ static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ /* - * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug - * lock and dbs_mutex. cpu_hotplug lock should always be held before - * dbs_mutex. If any function that can potentially take cpu_hotplug lock - * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then - * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock - * is recursive for the same process. -Venki - * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it - * would deadlock with cancel_delayed_work_sync(), which is needed for proper - * raceless workqueue teardown. + * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on + * different CPUs. It protects dbs_enable in governor start/stop. It also + * serializes governor limit_change with do_dbs_timer. We do not want + * do_dbs_timer to run when user is changing the governor or limits. */ static DEFINE_MUTEX(dbs_mutex); @@ -488,18 +483,17 @@ static void do_dbs_timer(struct work_struct *work) delay -= jiffies % delay; - if (lock_policy_rwsem_write(cpu) < 0) - return; + mutex_lock(&dbs_mutex); if (!dbs_info->enable) { - unlock_policy_rwsem_write(cpu); + mutex_unlock(&dbs_mutex); return; } dbs_check_cpu(dbs_info); queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay); - unlock_policy_rwsem_write(cpu); + mutex_unlock(&dbs_mutex); } static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) @@ -590,15 +584,16 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, &dbs_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } - dbs_timer_init(this_dbs_info); - mutex_unlock(&dbs_mutex); + dbs_timer_init(this_dbs_info); + break; case CPUFREQ_GOV_STOP: - mutex_lock(&dbs_mutex); dbs_timer_exit(this_dbs_info); + + mutex_lock(&dbs_mutex); sysfs_remove_group(&policy->kobj, &dbs_attr_group); dbs_enable--; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d1729353..246ae147df74 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -78,15 +78,10 @@ static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ /* - * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug - * lock and dbs_mutex. cpu_hotplug lock should always be held before - * dbs_mutex. If any function that can potentially take cpu_hotplug lock - * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then - * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock - * is recursive for the same process. -Venki - * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it - * would deadlock with cancel_delayed_work_sync(), which is needed for proper - * raceless workqueue teardown. + * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on + * different CPUs. It protects dbs_enable in governor start/stop. It also + * serializes governor limit_change with do_dbs_timer. We do not want + * do_dbs_timer to run when user is changing the governor or limits. */ static DEFINE_MUTEX(dbs_mutex); @@ -494,11 +489,10 @@ static void do_dbs_timer(struct work_struct *work) delay -= jiffies % delay; - if (lock_policy_rwsem_write(cpu) < 0) - return; + mutex_lock(&dbs_mutex); if (!dbs_info->enable) { - unlock_policy_rwsem_write(cpu); + mutex_unlock(&dbs_mutex); return; } @@ -517,7 +511,7 @@ static void do_dbs_timer(struct work_struct *work) dbs_info->freq_lo, CPUFREQ_RELATION_H); } queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); - unlock_policy_rwsem_write(cpu); + mutex_unlock(&dbs_mutex); } static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) @@ -598,14 +592,15 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, max(min_sampling_rate, latency * LATENCY_MULTIPLIER); } - dbs_timer_init(this_dbs_info); - mutex_unlock(&dbs_mutex); + + dbs_timer_init(this_dbs_info); break; case CPUFREQ_GOV_STOP: - mutex_lock(&dbs_mutex); dbs_timer_exit(this_dbs_info); + + mutex_lock(&dbs_mutex); sysfs_remove_group(&policy->kobj, &dbs_attr_group); dbs_enable--; mutex_unlock(&dbs_mutex); From 37c90e8887efd218dc4af949b7f498ca2da4af9f Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 2 Jul 2009 17:08:31 -0700 Subject: [PATCH 2/6] [CPUFREQ] Mark policy_rwsem as going static in cpufreq.c wont be exported lock_policy_rwsem_* and unlock_policy_rwsem_* routines in cpufreq.c are currently exported to drivers. Improper use of those locks can result in deadlocks and it is better to keep the locks localized. Two previous in-kernel users of these interfaces (ondemand and conservative), do not use this interfaces any more. Schedule them for removal. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- Documentation/feature-removal-schedule.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f8cd450be9aa..09e031c55887 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -458,3 +458,13 @@ Why: Remove the old legacy 32bit machine check code. This has been but the old version has been kept around for easier testing. Note this doesn't impact the old P5 and WinChip machine check handlers. Who: Andi Kleen + +---------------------------- + +What: lock_policy_rwsem_* and unlock_policy_rwsem_* will not be + exported interface anymore. +When: 2.6.33 +Why: cpu_policy_rwsem has a new cleaner definition making it local to + cpufreq core and contained inside cpufreq.c. Other dependent + drivers should not use it in order to safely avoid lockdep issues. +Who: Venkatesh Pallipadi From 5a75c82828e7c088ca6e7b4827911dc29cc8e774 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 2 Jul 2009 17:08:32 -0700 Subject: [PATCH 3/6] [CPUFREQ] Cleanup locking in ondemand governor Redesign the locking inside ondemand driver. Make dbs_mutex handle all the global state changes inside the driver and invent a new percpu mutex to serialize percpu timer and frequency limit change. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_ondemand.c | 62 +++++++++++++----------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 246ae147df74..d6ba14276bb1 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -70,8 +70,13 @@ struct cpu_dbs_info_s { unsigned int freq_lo_jiffies; unsigned int freq_hi_jiffies; int cpu; - unsigned int enable:1, - sample_type:1; + unsigned int sample_type:1; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. + */ + struct mutex timer_mutex; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -79,9 +84,7 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ /* * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on - * different CPUs. It protects dbs_enable in governor start/stop. It also - * serializes governor limit_change with do_dbs_timer. We do not want - * do_dbs_timer to run when user is changing the governor or limits. + * different CPUs. It protects dbs_enable in governor start/stop. */ static DEFINE_MUTEX(dbs_mutex); @@ -187,13 +190,18 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, return freq_hi; } +static void ondemand_powersave_bias_init_cpu(int cpu) +{ + struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); + dbs_info->freq_table = cpufreq_frequency_get_table(cpu); + dbs_info->freq_lo = 0; +} + static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); - dbs_info->freq_table = cpufreq_frequency_get_table(i); - dbs_info->freq_lo = 0; + ondemand_powersave_bias_init_cpu(i); } } @@ -235,12 +243,10 @@ static ssize_t store_sampling_rate(struct cpufreq_policy *unused, unsigned int input; int ret; ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; mutex_lock(&dbs_mutex); - if (ret != 1) { - mutex_unlock(&dbs_mutex); - return -EINVAL; - } dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate); mutex_unlock(&dbs_mutex); @@ -254,13 +260,12 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused, int ret; ret = sscanf(buf, "%u", &input); - mutex_lock(&dbs_mutex); if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD || input < MIN_FREQUENCY_UP_THRESHOLD) { - mutex_unlock(&dbs_mutex); return -EINVAL; } + mutex_lock(&dbs_mutex); dbs_tuners_ins.up_threshold = input; mutex_unlock(&dbs_mutex); @@ -358,9 +363,6 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) struct cpufreq_policy *policy; unsigned int j; - if (!this_dbs_info->enable) - return; - this_dbs_info->freq_lo = 0; policy = this_dbs_info->cur_policy; @@ -488,13 +490,7 @@ static void do_dbs_timer(struct work_struct *work) int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; - - mutex_lock(&dbs_mutex); - - if (!dbs_info->enable) { - mutex_unlock(&dbs_mutex); - return; - } + mutex_lock(&dbs_info->timer_mutex); /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = DBS_NORMAL_SAMPLE; @@ -511,7 +507,7 @@ static void do_dbs_timer(struct work_struct *work) dbs_info->freq_lo, CPUFREQ_RELATION_H); } queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); - mutex_unlock(&dbs_mutex); + mutex_unlock(&dbs_info->timer_mutex); } static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) @@ -520,8 +516,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; - dbs_info->enable = 1; - ondemand_powersave_bias_init(); dbs_info->sample_type = DBS_NORMAL_SAMPLE; INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work, @@ -530,7 +524,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { - dbs_info->enable = 0; cancel_delayed_work_sync(&dbs_info->work); } @@ -549,19 +542,15 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, if ((!cpu_online(cpu)) || (!policy->cur)) return -EINVAL; - if (this_dbs_info->enable) /* Already enabled */ - break; - mutex_lock(&dbs_mutex); - dbs_enable++; rc = sysfs_create_group(&policy->kobj, &dbs_attr_group); if (rc) { - dbs_enable--; mutex_unlock(&dbs_mutex); return rc; } + dbs_enable++; for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); @@ -575,6 +564,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, } } this_dbs_info->cpu = cpu; + ondemand_powersave_bias_init_cpu(cpu); + mutex_init(&this_dbs_info->timer_mutex); /* * Start the timerschedule work, when this governor * is used for first time @@ -602,20 +593,21 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_mutex); sysfs_remove_group(&policy->kobj, &dbs_attr_group); + mutex_destroy(&this_dbs_info->timer_mutex); dbs_enable--; mutex_unlock(&dbs_mutex); break; case CPUFREQ_GOV_LIMITS: - mutex_lock(&dbs_mutex); + mutex_lock(&this_dbs_info->timer_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, policy->min, CPUFREQ_RELATION_L); - mutex_unlock(&dbs_mutex); + mutex_unlock(&this_dbs_info->timer_mutex); break; } return 0; From ee88415caf736b89500f16e0a545614541a45005 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 2 Jul 2009 17:08:33 -0700 Subject: [PATCH 4/6] [CPUFREQ] Cleanup locking in conservative governor Redesign the locking inside conservative driver. Make dbs_mutex handle all the global state changes inside the driver and invent a new percpu mutex to serialize percpu timer and frequency limit change. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_conservative.c | 34 ++++++++++---------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 58889f26029a..57490502b21c 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -63,7 +63,12 @@ struct cpu_dbs_info_s { unsigned int down_skip; unsigned int requested_freq; int cpu; - unsigned int enable:1; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. + */ + struct mutex timer_mutex; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -71,9 +76,7 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */ /* * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on - * different CPUs. It protects dbs_enable in governor start/stop. It also - * serializes governor limit_change with do_dbs_timer. We do not want - * do_dbs_timer to run when user is changing the governor or limits. + * different CPUs. It protects dbs_enable in governor start/stop. */ static DEFINE_MUTEX(dbs_mutex); @@ -138,9 +141,6 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_policy *policy; - if (!this_dbs_info->enable) - return 0; - policy = this_dbs_info->cur_policy; /* @@ -483,17 +483,12 @@ static void do_dbs_timer(struct work_struct *work) delay -= jiffies % delay; - mutex_lock(&dbs_mutex); - - if (!dbs_info->enable) { - mutex_unlock(&dbs_mutex); - return; - } + mutex_lock(&dbs_info->timer_mutex); dbs_check_cpu(dbs_info); queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay); - mutex_unlock(&dbs_mutex); + mutex_unlock(&dbs_info->timer_mutex); } static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) @@ -502,7 +497,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; - dbs_info->enable = 1; INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work, delay); @@ -510,7 +504,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { - dbs_info->enable = 0; cancel_delayed_work_sync(&dbs_info->work); } @@ -529,9 +522,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, if ((!cpu_online(cpu)) || (!policy->cur)) return -EINVAL; - if (this_dbs_info->enable) /* Already enabled */ - break; - mutex_lock(&dbs_mutex); rc = sysfs_create_group(&policy->kobj, &dbs_attr_group); @@ -555,6 +545,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, this_dbs_info->down_skip = 0; this_dbs_info->requested_freq = policy->cur; + mutex_init(&this_dbs_info->timer_mutex); dbs_enable++; /* * Start the timerschedule work, when this governor @@ -596,6 +587,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_mutex); sysfs_remove_group(&policy->kobj, &dbs_attr_group); dbs_enable--; + mutex_destroy(&this_dbs_info->timer_mutex); /* * Stop the timerschedule work, when this governor @@ -611,7 +603,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, break; case CPUFREQ_GOV_LIMITS: - mutex_lock(&dbs_mutex); + mutex_lock(&this_dbs_info->timer_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target( this_dbs_info->cur_policy, @@ -620,7 +612,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, __cpufreq_driver_target( this_dbs_info->cur_policy, policy->min, CPUFREQ_RELATION_L); - mutex_unlock(&dbs_mutex); + mutex_unlock(&this_dbs_info->timer_mutex); break; } From 3f4a782b5ce2698b1870b5a7b573cd721d4fce33 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 3 Jul 2009 11:25:16 -0400 Subject: [PATCH 5/6] [CPUFREQ] fix (utter) cpufreq_add_dev mess OK, I've tried to clean it up the best I could, but please test this with concurrent cpu hotplug and cpufreq add/remove in loops. I'm sure we will make other interesting findings. This is step one of fixing the overall locking dependency mess in cpufreq. Signed-off-by: Mathieu Desnoyers CC: Venkatesh Pallipadi CC: rjw@sisk.pl CC: mingo@elte.hu CC: Shaohua Li CC: Pekka Enberg CC: Dave Young CC: "Rafael J. Wysocki" CC: Rusty Russell CC: sven.wegener@stealer.net CC: cpufreq@vger.kernel.org CC: Thomas Renninger Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 65 ++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c7fe16e0474b..c668ac855f71 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -761,6 +761,10 @@ static struct kobj_type ktype_cpufreq = { * cpufreq_add_dev - add a CPU device * * Adds the cpufreq interface for a CPU device. + * + * The Oracle says: try running cpufreq registration/unregistration concurrently + * with with cpu hotplugging and all hell will break loose. Tried to clean this + * mess up, but more thorough testing is needed. - Mathieu */ static int cpufreq_add_dev(struct sys_device *sys_dev) { @@ -804,15 +808,12 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) goto nomem_out; } if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) { - kfree(policy); ret = -ENOMEM; - goto nomem_out; + goto err_free_policy; } if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { - free_cpumask_var(policy->cpus); - kfree(policy); ret = -ENOMEM; - goto nomem_out; + goto err_free_cpumask; } policy->cpu = cpu; @@ -820,7 +821,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) /* Initially set CPU itself as the policy_cpu */ per_cpu(policy_cpu, cpu) = cpu; - lock_policy_rwsem_write(cpu); + ret = (lock_policy_rwsem_write(cpu) < 0); + WARN_ON(ret); init_completion(&policy->kobj_unregister); INIT_WORK(&policy->update, handle_update); @@ -833,7 +835,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = cpufreq_driver->init(policy); if (ret) { dprintk("initialization failed\n"); - goto err_out; + goto err_unlock_policy; } policy->user_policy.min = policy->min; policy->user_policy.max = policy->max; @@ -858,15 +860,21 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) /* Check for existing affected CPUs. * They may not be aware of it due to CPU Hotplug. */ - managed_policy = cpufreq_cpu_get(j); /* FIXME: Where is this released? What about error paths? */ + managed_policy = cpufreq_cpu_get(j); if (unlikely(managed_policy)) { /* Set proper policy_cpu */ unlock_policy_rwsem_write(cpu); per_cpu(policy_cpu, cpu) = managed_policy->cpu; - if (lock_policy_rwsem_write(cpu) < 0) - goto err_out_driver_exit; + if (lock_policy_rwsem_write(cpu) < 0) { + /* Should not go through policy unlock path */ + if (cpufreq_driver->exit) + cpufreq_driver->exit(policy); + ret = -EBUSY; + cpufreq_cpu_put(managed_policy); + goto err_free_cpumask; + } spin_lock_irqsave(&cpufreq_driver_lock, flags); cpumask_copy(managed_policy->cpus, policy->cpus); @@ -877,12 +885,14 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = sysfs_create_link(&sys_dev->kobj, &managed_policy->kobj, "cpufreq"); - if (ret) - goto err_out_driver_exit; - - cpufreq_debug_enable_ratelimit(); - ret = 0; - goto err_out_driver_exit; /* call driver->exit() */ + if (!ret) + cpufreq_cpu_put(managed_policy); + /* + * Success. We only needed to be added to the mask. + * Call driver->exit() because only the cpu parent of + * the kobj needed to call init(). + */ + goto out_driver_exit; /* call driver->exit() */ } } #endif @@ -892,25 +902,25 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj, "cpufreq"); if (ret) - goto err_out_driver_exit; + goto out_driver_exit; /* set up files for this cpu device */ drv_attr = cpufreq_driver->attr; while ((drv_attr) && (*drv_attr)) { ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) - goto err_out_driver_exit; + goto err_out_kobj_put; drv_attr++; } if (cpufreq_driver->get) { ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); if (ret) - goto err_out_driver_exit; + goto err_out_kobj_put; } if (cpufreq_driver->target) { ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) - goto err_out_driver_exit; + goto err_out_kobj_put; } spin_lock_irqsave(&cpufreq_driver_lock, flags); @@ -928,12 +938,14 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) continue; dprintk("CPU %u already managed, adding link\n", j); - cpufreq_cpu_get(cpu); + managed_policy = cpufreq_cpu_get(cpu); cpu_sys_dev = get_cpu_sysdev(j); ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, "cpufreq"); - if (ret) + if (ret) { + cpufreq_cpu_put(managed_policy); goto err_out_unregister; + } } policy->governor = NULL; /* to assure that the starting sequence is @@ -965,17 +977,20 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) per_cpu(cpufreq_cpu_data, j) = NULL; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); +err_out_kobj_put: kobject_put(&policy->kobj); wait_for_completion(&policy->kobj_unregister); -err_out_driver_exit: +out_driver_exit: if (cpufreq_driver->exit) cpufreq_driver->exit(policy); -err_out: +err_unlock_policy: unlock_policy_rwsem_write(cpu); +err_free_cpumask: + free_cpumask_var(policy->cpus); +err_free_policy: kfree(policy); - nomem_out: module_put(cpufreq_driver->owner); module_out: From a2e1b4c31257c07f148a89eb7eea7ca959fd0642 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 26 Jul 2009 10:55:25 -0500 Subject: [PATCH 6/6] [CPUFREQ] Powernow-k8: support family 0xf with 2 low p-states Provide support for family 0xf processors with 2 P-states below the elevator voltage. Remove the checks that prevent this configuration from being supported and increase the transition voltage to prevent errors during the transition. Signed-off-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 30 +++++++++-------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 ++- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 81cbe64ed6b4..ae068f59603f 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index c9c1190b5e1f..02ce824073cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);