Search     or:     and:
 LINUX 
 Language 
 Kernel 
 Package 
 Book 
 Test 
 OS 
 Forum 
iakovlev.org
by Mike Rieker

Introduction
Рассматривается мульти-процессорная система. Пусть у нас имеется список трэдов, стоящих в очереди. Надо понять, как сделать так, чтобы очередной тред запускался только одним процессором.
Если вы не пишете мульти-процессорную ось, то эта статья не для вас.


Getting along without them
So let's take our example started above. We have a list of threads that are ready to be executed, and we want to be sure that only one cpu will start executing the thread. Here's the idea:
*lock all other cpu's out of the list of threads waiting for a cpu

*unlink the first one that's waiting, if any

*unlock the list

*if we didn't find a thread, repeat the whole thing

*else, jump to the thread


Now how do we do that 'lock all other cpu's...'?

If we just:

[view code in window]
// Shared run queue: singly-linked list of threads waiting for a cpu.
// threads_waiting_to_run_end always points at the NULL link at the tail
// (at threads_waiting_to_run itself when the list is empty), so
// wake_thread can append in O(1).
static int threads_waiting_to_run_lock = 0;
static Thread *threads_waiting_to_run = NULL;
static Thread **threads_waiting_to_run_end = &threads_waiting_to_run;

// Dequeue the first waiting thread, spinning until one is available.
// NOTE: this version is deliberately BROKEN (the article's teaching
// example): the test of threads_waiting_to_run_lock and the store of 1
// into it are two separate memory operations, so two cpus can both see
// 0 and both "acquire" the lock. See test_and_set for the real fix.
Thread *find_a_thread_to_run (void)
{
Thread *thread_to_start;

do {
while (threads_waiting_to_run_lock) {} // repeat while some other cpu is using the queue
threads_waiting_to_run_lock = 1; // tell other cpu's the queue is being used


thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
threads_waiting_to_run = thread_to_start -> next_thread;
if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}


threads_waiting_to_run_lock = 0; // release the queue (original said 'lock = 0;' -- an undeclared variable)
} while (thread_to_start == NULL); // repeat if I didn't get anything to do


return (thread_to_start);
}

That won't work because if two cpu's simultaneously hit the while statement, they will both see threads_waiting_to_run_lock as being zero, then they both will set it to one. And they both will dequeue the same top thread, which is no good.

No simple C statements will solve this problem.


What makes it work
So, for Intel processors, there is the 'lock' instruction prefix. This says, that for the following instruction, this instruction shall be the only one that accesses that memory location. So we can make a routine like this:

[view code in window]
// Atomically store new_value into *lock_pointer and return the value
// that was there before.  Returning 0 means the caller acquired the
// lock; non-zero means another cpu already holds it.
int test_and_set (int new_value, int *lock_pointer);


.globl test_and_set
test_and_set:
movl 4(%esp),%eax # get new_value into %eax (cdecl: first arg at 4(%esp))
movl 8(%esp),%edx # get lock_pointer into %edx (second arg)
lock # next instruction is locked
xchgl %eax,(%edx) # swap %eax with what is stored in (%edx)
# ... and don't let any other cpu touch that
# ... memory location while you're swapping
ret # return the old value that's in %eax

Now we can correctly make our routine:

[view code in window]
// Dequeue the first waiting thread, spinning until one is available.
// The queue is protected by threads_waiting_to_run_lock, acquired via
// the atomic test_and_set; the statement order (dequeue strictly
// between acquire and release) is what makes this correct on SMP.
// Interrupt interaction is not handled yet -- the article adds that
// in a later revision of this routine.
Thread *find_a_thread_to_run (void)
{
Thread *thread_to_start;

do {
while (test_and_set (1, &threads_waiting_to_run_lock)) {} // repeat while some other cpu is using the queue
// ... then tell other cpu's the queue is being used

thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
threads_waiting_to_run = thread_to_start -> next_thread;
// queue drained: re-point the tail at the (now NULL) list head
if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}

threads_waiting_to_run_lock = 0; // tell other cpu's the queue is no longer busy
} while (thread_to_start == NULL); // repeat if I didn't get anything to do

return (thread_to_start);
}


So the 'while' loop can be said to be 'spinning' on that test_and_set call while some other cpu is using the lock.


Dealing with interrupts
Now for the complications...

You may ask yourself, isn't this thing going to sit there forever in that do..while loop if there is nothing in the queue? What puts things in the queue?

So some device raises a hardware interrupt while we're in the loop. This interrupt routine checks the disk or keyboard status, does its thing, and realizes it is time to wake up a thread that is waiting for the I/O. Here is where it puts the thread in the 'threads_waiting_to_run' queue.

Now it can't just:

[view code in window]
// Append thread_to_wake at the tail of the run queue.
// NOTE: this is the article's deliberately naive version -- it takes no
// lock and does not inhibit interrupts, so it races with
// find_a_thread_to_run (shown in the prose that follows).
void wake_thread (Thread *thread_to_wake)
{
thread_to_wake -> next_thread = NULL;
*threads_waiting_to_run_end = thread_to_wake;
threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
}

... because the interrupt may have happened between the second and third lines of:

[view code in window]
thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
threads_waiting_to_run = thread_to_start -> next_thread;
if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}

So you in essence have the following happening:

[view code in window]
thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {


--> interrupt
thread_to_wake -> next_thread = NULL;
*threads_waiting_to_run_end = thread_to_wake;
threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
<-- return from interrupt


threads_waiting_to_run = thread_to_start -> next_thread;
if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}

... and we end up possibly losing our 'thread_to_wake'.

To solve this, we just inhibit hardware interrupt delivery as well as lock our spinlock:

[view code in window]
// Dequeue the first waiting thread, spinning until one is available.
// Interrupt delivery is inhibited BEFORE taking the spinlock and
// re-enabled only AFTER releasing it, so an interrupt-level
// wake_thread on this cpu can never deadlock against us while we
// hold the queue lock.
Thread *find_a_thread_to_run (void)
{
Thread *thread_to_start;

do {
inhib_hw_int_delivery (); // make sure interrupt routine doesn't interfere
while (test_and_set (1, &threads_waiting_to_run_lock)) {} // repeat while some other cpu is using the queue
// ... then tell other cpu's the queue is being used

thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
threads_waiting_to_run = thread_to_start -> next_thread;
// queue drained: re-point the tail at the (now NULL) list head
if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}

threads_waiting_to_run_lock = 0; // tell other cpu's the queue is no longer busy
enable_hw_int_delivery (); // allow interrupts to happen now
} while (thread_to_start == NULL); // repeat if I didn't get anything to do

return (thread_to_start);
}



Interrupts on a multiprocessor
Now we're almost done. If we are on a multiprocessor, one of the cpu's might be executing the do..while that is waiting for something to work on, and another cpu might be doing the interrupt processing. So we end up with our bad situation again, where we have the four statements executing simultaneously. Easy to fix. We just put our spinlock around the statements in our interrupt routine as well:

[view code in window]
// Append thread_to_wake at the tail of the run queue.  Safe to call
// from thread level or from an interrupt routine: the queue spinlock
// is held for the append, and interrupt delivery is inhibited around
// it.  'hwi' presumably records whether delivery was enabled on entry
// so the prior state is restored -- TODO confirm inhib_hw_int_delivery's
// return convention against its definition.
void wake_thread (Thread *thread_to_wake)
{
int hwi;

thread_to_wake -> next_thread = NULL;

hwi = inhib_hw_int_delivery (); // make sure interrupt delivery is inhibited
while (test_and_set (1, &threads_waiting_to_run_lock)) {} // repeat while some other cpu is using the queue
// ... then tell other cpu's the queue is being used
*threads_waiting_to_run_end = thread_to_wake;
threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
if (hwi) enable_hw_int_delivery (); // restore interrupt delivery state
}



Rules to live by
It seems very complicated from all that. Well, if you follow a couple simple rules, it isn't!
1) When modifying a variable that can possibly be modified by more than one cpu at a time, then you need to protect it with a spinlock. You must *always* have this spinlock when accessing that variable.
2) If the variable can also be modified by an interrupt routine, you must disable (at least that) interrupt while the spinlock is being held.


Uniprocessors
So now, you uniprocessor people wonder why all this is necessary. For a uniprocessor, you have:


[view code in window]
Thread *find_a_thread_to_run (void)
{
do {
inhib_hw_int_delivery (); // make sure interrupt routine doesn't interfere

thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) threads_waiting_to_run = thread_to_start -> next_thread;

enable_hw_int_delivery (); // allow interrupts to happen now
} while (thread_to_start == NULL); // repeat if I didn't get anything to do
return (thread_to_start);
}

// Uniprocessor version of wake_thread: identical to the SMP version
// minus the spinlock -- inhibiting interrupt delivery alone suffices
// when there is only one cpu.
void wake_thread (Thread *thread_to_wake)
{
int hwi;

thread_to_wake -> next_thread = NULL;
hwi = inhib_hw_int_delivery (); // make sure interrupt delivery is inhibited
*threads_waiting_to_run_end = thread_to_wake;
threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
if (hwi) enable_hw_int_delivery (); // restore interrupt delivery state
}

So you can see that it takes all of 2 extra lines per routine to make it multiprocessor ready!
Part 2 talks about some more practical stuff.


Simultaneous spinlocks
Now after you get into this stuff, you may find that you have to have more than one spinlock at a time. This can lead to problems. Consider this:

Routine A
lock spinlock X
maybe do some work
lock spinlock Y
do some more work
unlock spinlock Y
unlock spinlock X

And Routine B
lock spinlock Y
maybe do some work
lock spinlock X
do some more work
unlock spinlock X
unlock spinlock Y

So CPU #1 comes along and starts routine A, while CPU #2 starts routine B. Well, they'll never finish, because CPU #1 will be waiting for CPU #2 to release spinlock Y and CPU #2 will be waiting for CPU #1 to release spinlock X. So we have another simple rule:
* When locking more than one spinlock, they must always be locked in the same order.

So for our example, both routines need to be changed so that they either both lock X then Y or they both lock Y then X. You may be thinking, 'I might not be able to do that in ALL cases!' Easy to fix, replace the two spinlocks with one, say Z.

Now I am terrible at checking to make sure I do everything in order. Computers are good at that sort of thing. So rather than just using an 'int' for my spinlocks, I use a struct as follows:

[view code in window]
// A spinlock carrying its own ordering level: spinlock_wait checks
// that locks are always acquired in increasing level order, which
// makes lock-ordering (deadlock-avoidance) violations a runtime panic
// instead of a silent hang.
typedef struct { char spinlock_flag; // the usual 0=unlocked, 1=locked
unsigned char spinlock_level; // 'locking' order -- must increase with each nested acquire
} Spinlock;

Then I have a spinlock_wait routine that checks to make sure my new spinlock's level is .gt. the last spinlock's level that was locked by this cpu. If I try to do it out of order, the routine panics/BSOD's on the spot.


Interrupts
Another question you may wonder, must I always inhibit hardware interrupt delivery when I have a spinlock?

No. Only if the data being worked with can also be accessed by an interrupt routine. And then, you only have to disable those interrupts that have access to it.

For example, I don't allow my interrupt routines to do malloc's. So the malloc routine can run with interrupts enabled, even when it has the spinlock. Same with pagetables, disk cache pages, and other stuff like that.

You must, however, block any pre-emptive scheduling while you hold any spinlock. Or at least, you will make things *very* complicated if you don't.

So I ended up with several 'classes' of spinlocks:

low priority - these run with all hardware interrupt delivery enabled
interrupt level - there is one per IRQ level. IRQ 0 is the highest, IRQ 7 is the lowest. when one of these is set on a cpu, it inhibits that IRQ and any lower priority interrupts
high priority - these run with all hardware interrupt delivery inhibited

So if a cpu has one of the low priority spinlocks, it is running with hardware interrupt delivery enabled (but the pre-emptive scheduler is disabled).

There is one spinlock per interrupt level. For example, the keyboard driver uses IRQ 1. So whenever the keyboard interrupt routine is active, I have the cpu take the IRQ 1 spinlock. Whenever the main keyboard driver routine is accessing data that the interrupt routine would access, I take the IRQ 1 spinlock. So it in essence acts as a 'inhibit interrupt delivery' mechanism that works across cpu's in an SMP system.

The high-priority spinlocks block all interrupt delivery on the cpu. These are used for routines (like wake_thread) that need to be callable from any interrupt level.


Performance
This is a big thing about spinlocks. It's easy to make your performance go to pot with poor use of spinlocks.

Consider that you can *theoretically* use just *one* spinlock, period. Whenever you enter kernel mode, inhibit all interrupt delivery and set your one-and-only spinlock. When you leave kernel mode, release the spinlock then enable interrupt delivery. Great!

Well, you can imagine then, that what you end up with is having just one cpu doing anything useful at a time in kernel mode, while the other(s) just grind away spinning. Really bad!

So what you do is try to come up with as many spinlocks as you can. Above I mentioned that you must combine X and Y if you have a case where you must set X then Y and another case where you must set Y then X. So now you have two limits to work with:
*Too few spinlocks and you have poor performance

*Too many spinlocks and you violate the ordering


Here's where the creativity comes in, working with those two limits to maximize performance. Strive to eliminate as much spinning as possible.

How does one measure spinlock performance? Make an array of counters like this:

[view code in window]
static int spin_counts[256];


Then increment spin_counts[spinlock_level] each time the 'test_and_set' fails. After a while of running stuff, find which spinlock_level has big counts (compared to the others). Try to break that spinlock down into smaller ones if you can figure out how to.


Summary
They may seem sort of nebulous to begin with. But after you work with them, application becomes very scientific. I hope this little tutorial has helped clear up some issues and gives you confidence.


Next
In some cases, there are simple and effective alternatives to spinlocks.


Increment/Decrement
Consider the case:

[view code in window]
// Drop one reference to 'thread'; free it when the count reaches zero.
// Spinlock-based version: the lock only guards the decrement-and-read,
// and is released before kfree (freeing while holding the thread's own
// lock would be self-defeating).
void dec_thread_refcount (Thread *thread)
{
int new_refcount;

acquire_spinlock (&(thread -> spinlock)); // lock access to thread -> refcount
new_refcount = -- (thread -> refcount); // decrement it, save new value
release_spinlock (&(thread -> spinlock)); // unlock
if (new_refcount == 0) kfree (thread); // if no one is using it anymore, free it
}

Seems like a lot of work acquiring and releasing the spinlock just to decrement the refcount. Well, remember the 'lock' prefix instruction we used to make the test_and_set thing? We can be a little fancier about it and make a 'dec_and_testz' routine:

[view code in window]
// Atomically decrement *refcount; return 1 if it became zero, else 0.
int locked_dec_and_testz (int *refcount); // decrement *refcount, return whether or not it went zero

.globl locked_dec_and_testz
locked_dec_and_testz:
xorl %eax,%eax # clear all of %eax (so setz %al yields a clean 0/1 int)
movl 4(%esp),%edx # get pointer to refcount
lock # keep other cpu's out of next instruction
decl (%edx) # decrement refcount, keeping other cpu's out of refcount
setz %al # set %eax=1 if result is zero, set %eax=0 otherwise
ret # return %eax as the result

So now we can write:

[view code in window]
// Drop one reference to 'thread'; free it when the count reaches zero.
// Lock-free version: refcount is maintained solely with bus-locked
// instructions (locked_inc / locked_dec_and_testz), never with the
// thread's spinlock.
void dec_thread_refcount (Thread *thread)
{
if (locked_dec_and_testz (&(thread -> refcount))) { // original was missing a ')'
kfree (thread);
}
}

Now this has a little gotcha. 'refcount' must now be thought of as being a variable not protected by a spinlock, but by 'lock instruction prefix' access only! So any modifications to refcount must be done with the lock prefix. So we can no longer:

[view code in window]
acquire_spinlock (&(thread -> spinlock));
(thread -> refcount) ++;
release_spinlock (&(thread -> spinlock));

... because the locked_dec_and_testz routine might be in progress on another cpu, and our 'spinlock' won't do anything to stop it! So we have to supply a 'locked_inc' routine:

[view code in window]
// Atomically increment *refcount (the matching acquire-side of
// locked_dec_and_testz -- all refcount updates must be bus-locked).
void locked_inc (int *refcount);

.globl locked_inc
locked_inc:
movl 4(%esp),%edx # get pointer to refcount
lock # bus lock the next instruction
incl (%edx) # atomically increment *refcount
ret

But we still come out ahead, because there is no possibility of doing any spinning in any of these routines. (The actual spinning is done at the CPU bus level, which is very very fast).


Atomic arithmetic
Now we can generally apply increment/decrement to any single arithmetic operation on a single variable. It is possible to write routines to do add, subtract, or, and, xor, etc, and have the routine return the previous value.

For example, this routine or's in 'new_bits' into *value, and returns the previous contents of value:

[view code in window]
// Atomically *value |= new_bits; return the previous contents of *value.
int atomic_or_rtnold (int new_bits, int *value);

.globl atomic_or_rtnold
atomic_or_rtnold:
movl 8(%esp),%edx # point to value
movl (%edx),%eax # sample the current contents of value
atomic_or_loop:
movl 4(%esp),%ecx # get the new_bits
orl %eax,%ecx # or them together
lock # bus lock the next instruction
cmpxchgl %ecx,(%edx) # if 'value' hasn't changed, store our new value there
# on mismatch, cmpxchg reloads %eax with the
# current (%edx), ready for the retry
jne atomic_or_loop # repeat if 'value' has changed
ret # return with old (sampled) contents of value

Now you notice this does have a little 'spin' in it, it has a loop back. But the loop is so short, that the chances of conflicting with other modifications of the *value is very slim, so it is highly unlikely that it will ever loop.

If we don't want the old value, we can do this:

[view code in window]
// Atomically *value |= new_bits, discarding the old contents.
// No loop needed: a locked 'or' to memory is atomic by itself.
void atomic_or (int new_bits, int *value);

.globl atomic_or
atomic_or:
movl 4(%esp),%eax # get the new_bits
movl 8(%esp),%edx # point to value
lock # bus lock the next instruction
orl %eax,(%edx) # update value
ret

... and not have any loop in there at all.


Atomic linked stack
You can get 'nasty' and implement a LIFO linked stack with atomic operations. Suppose we have:

[view code in window]
typedef struct Block Block;
struct Block { Block *next_block; // LIFO free-list link
...
};

static Block *free_blocks = NULL; // top of the atomic free stack

// Push 'block_to_free' onto the free list without any spinlock:
// sample the top, link the new block to it, then compare-and-swap it
// in as the new top; retry if another cpu changed free_blocks between
// the sample and the swap.
void free_a_block (Block *block_to_free)
{
Block *old_top_free_block;

do {
old_top_free_block = free_blocks;
block_to_free -> next_block = old_top_free_block; // original said 'block_to_free = old_top_free_block;', which clobbered the argument and never linked the block
} while (!atomic_setif_ptr (&free_blocks, block_to_free, old_top_free_block));
}

atomic_setif_ptr says:

if (free_blocks != old_top_free_block) return (0);
else {
free_blocks = block_to_free;
return (1);
}

# Atomically: if (free_blocks == old_top_free_block)
#                 { free_blocks = block_to_free; return 1; }
#             else return 0;
.globl atomic_setif_ptr
atomic_setif_ptr:
movl 4(%esp),%edx # get pointer to free_blocks
movl 8(%esp),%ecx # get block_to_free
movl 12(%esp),%eax # get old_top_free_block (comparand for cmpxchg)
lock # bus lock the next instruction
cmpxchgl %ecx,(%edx) # if free_blocks == old_top_free_block, then free_blocks = block_to_free
jne atomic_setif_failed
movl $1,%eax # success, return 1
ret
atomic_setif_failed:
xorl %eax,%eax # failure, return 0
ret

Now we can use that same routine to write the alloc routine:

[view code in window]
// Pop the top block off the atomic free stack, or return NULL if empty.
// NOTE(review): this sampled-pointer pop pattern looks exposed to the
// classic ABA hazard (top freed and re-pushed between the sample and
// the swap would make the compare succeed with a stale next_block) --
// confirm whether the allocator's usage rules out that interleaving.
Block *alloc_a_block (void)

{
Block *sample_block;

do {
sample_block = free_blocks;
if (sample_block == NULL) break; // empty list: nothing to pop
} while (!atomic_setif_ptr (&free_blocks, sample_block -> next_block, sample_block));
return (sample_block);
}

But again, the gotcha is that 'free_blocks' can only be modified with atomic operations. So if you must also scan the list, you will have to use a spinlock.


Summary
Simple spinlocked sequences can be replaced with atomic operations and can result in performance improvements. Now you have more fun stuff to play with!


Оставьте свой комментарий !

Ваше имя:
Комментарий:
Оба поля являются обязательными

 Автор  Комментарий к данной статье