// rwsem prototype implementation: many readers, only one writer at a time
typedef struct rwsem { // what rwsem needs to track
  spinlock L; // a lock of some sort (TBD): protects "owners" field
  // use (quick) spinlock b/c we're only changing one int in memory
  // "owners" encodes the 3 "states" that the rwsem is in (see next line)
  int owners; // >0 (N readers), -1 (one writer), 0 (no one owns the lock)
  int max_owners; // need to cap the max len of waiters, so as not to run
		  // out of kernel mem (or consume too many CPU resources)
  // NOTE(review): max_owners is declared but never checked by any of the
  // lock/unlock APIs below -- enforcement is still TODO in this prototype.
  wait_q waiting_readers; // list of all waiting readers (assume FCFS)
  wait_q waiting_writers; // list of all waiting writers (assume FCFS)
} rwsem_t;

// The state of the system at any point in time can be:
// 1. owners is <0, 0, or >0 [3 different states]
// 2. there could be waiting_readers or waiting_writers [2 more states]
// 3. Four API calls: un/lock read/write [4 APIs]
// meaning for each of the 4 APIs you have to consider each one of the 3x2=6
// states of the system

// Acquire the rwsem for reading.
// owners > 0: readers already in, join them.  owners == -1: a writer is
// active, must wait.  owners == 0 but writers are queued: also wait, so a
// continuous stream of new readers cannot starve the pending writer (the
// current generation of readers is allowed to "drain" first).
lock_read(rwsem_t *rws)
{
  lock(rws->L); // L protects "owners" and both wait queues; must hold it
		// before reading or modifying owners
  if (rws->owners < 0 || !empty_queue(rws->waiting_writers)) {
    // Can't enter now.  Don't return an error (exit(1) would be worse, see
    // samples below) -- block this reader instead, put it in WAIT state.
    add_to(rws->waiting_readers); // add this thread to waiting readers queue
    // change thread state to WAITING (moved to WAIT state in scheduler)
    // NOTE(review): once woken (see unlock_write), this thread must
    // re-attempt the acquisition -- or the waker must bump "owners" on its
    // behalf -- before entering the CS; confirm the scheduler contract.
    unlock(rws->L);
    return;
  }
  // owners >= 0 and no waiting writers: admit this reader.
  rws->owners++; // this is the CS of the spinlock L
  unlock(rws->L);
  // successful return means we allowed the caller to enter the read CS
}
// Release a read hold.  Decrements "owners"; when the last reader leaves
// and writers are queued, wakes exactly one writer (FCFS hand-off).
unlock_read(rwsem_t *rws)
{
  lock(rws->L);
  if (rws->owners <= 0) { // no owners, or a writer owns it: API misuse
    unlock(rws->L); // unlock first; BUG() hangs the kernel either way
    BUG(); // caller did unlock_read without a matching lock_read
    return;
  }
  // owners > 0: at least one reader, as required.
  rws->owners--;
  if (rws->owners == 0 && !empty_queue(rws->waiting_writers)) {
    wakeup_one(rws->waiting_writers); // move oldest (or only) waiting
				      // writer from WAIT to READY state
  }
  unlock(rws->L);
  // successful return: the caller's read hold is released
}

// Acquire the rwsem for writing (exclusive: at most one writer, no readers).
lock_write(rwsem_t *rws)
{
  lock(rws->L);
  if (rws->owners == 0) { // no owners: take exclusive ownership
    rws->owners = -1; // BUGFIX: was "rws->owners == -1;" -- a comparison
		      // with no effect; ownership was never recorded
    unlock(rws->L);
    return;
  }
  // owners != 0: either one or more readers (>0) or another writer (<0).
  // In both cases block this writer -- don't return an error, for the same
  // reasons as lock_read.  (BUGFIX: the original only enqueued when readers
  // were present; the "another writer" branch was a comment with no code,
  // so a second writer would fall through without ever being queued.)
  // Note: rwsem is "exclusive" for writers.  If you find that too many
  // writers want to modify the protected data at the same time, your use of
  // rwsem is flawed -- might as well use a simpler/quicker mutex/spinlock.
  add_to(rws->waiting_writers); // add this thread to waiting writers queue
  // this writer waits for owners==0 (all readers/the writer to be done)
  // change thread state to WAITING (moved to WAIT state in scheduler)
  // NOTE(review): as in lock_read, the woken writer must re-attempt the
  // acquisition -- confirm the scheduler contract.
  unlock(rws->L);
}
// Release the (single) write hold.  Prefers waking all queued readers, who
// can share the lock; if none are waiting, hands off to the oldest queued
// writer instead.
unlock_write(rwsem_t *rws)
{
  lock(rws->L);
  if (rws->owners != -1) { // must be exactly one writer (checking "< 0"
			   // alone would mask a corrupted owners < -1)
    unlock(rws->L); // note: since calling BUG() next, it won't matter if we
		    // unlock or not (kernel is hung and needs a reboot)
    BUG(); // no owners, or readers own it: rwsem used incorrectly
    return;
  }
  rws->owners = 0; // BUGFIX: was "rws->owners == 0;" -- a comparison with
		   // no effect; the lock was never actually released
  if (!empty_queue(rws->waiting_readers)) {
    // wake ALL waiting readers: they can enter concurrently
    // (ideally oldest sleepers first -- fairness principle)
    wakeup_all(rws->waiting_readers); // move them from WAIT to READY state
  } else if (!empty_queue(rws->waiting_writers)) {
    // BUGFIX: originally queued writers were only woken by unlock_read;
    // if ONLY writers were waiting here, none was ever woken (lost wakeup)
    wakeup_one(rws->waiting_writers);
  }
  unlock(rws->L);
}

// DELIBERATELY BROKEN variant, kept as a teaching example: it reads
// "owners" under the lock, DROPS the lock, then acts on the (now stale)
// snapshot -- a classic check-then-act race condition.
lock_write_v2(rwsem_t *rws)
{
  int i;
  lock(rws->L);
  i = rws->owners; // snapshot of owners
  unlock(rws->L); // right after unlocking here, someone ELSE can grab L and
		  // change rws->owners!!!  The decision below is based on a
		  // stale value.  Our CS should use values that are
		  // "atomic" (as in databases, not changing).  Be careful
		  // not to break CSs into smaller units in the name of
		  // 'efficiency' while breaking data consistency!
  if (i == 0) {
    lock(rws->L);
    rws->owners = -1; // BUGFIX: was "==" (comparison, no effect).  The
		      // race above remains INTENTIONALLY, to show the bug.
    unlock(rws->L);
    return;
  }
  //... other code from lock_write above
  // (BUGFIX: removed the original trailing unlock(L) here -- it had no
  // matching lock(L) on this path, an unbalanced release)
}

// example of how one might use rwsem
// assume what we want to protect is the inode owner (UID) field
// we read the inode/file's owner/UID many times, but often don't change it
// unless root called chown(2).
// Read-side user: many concurrent permission checks may share the lock.
check_inode_permission()
{
  // code here
  lock_read(inode->i_rwsem); // programmer defined that inode->i_rwsem
			     // protects inode->i_uid
  read value of inode->i_uid; // this is the CS of the rwsem (pseudocode:
			      // stands for any read-only use of i_uid)
  unlock_read(inode->i_rwsem);
  // code here
}
// Write-side user: chown(2) path, rare relative to permission checks --
// exactly the read-mostly workload rwsem is designed for.
chown_inode_owner(int new_owner)
{
  // code here
  lock_write(inode->i_rwsem); // exclusive: no readers/writers while we write
  inode->i_uid = new_owner; // this is the CS of the rwsem
  unlock_write(inode->i_rwsem);
  // code here
}

check_inode_permission2() // what if lock_read returned a success/failure code
{
  // NOTE(review): the two "alternatives" below are an illustrative listing,
  // not real control flow -- as literally written, alternative 1's success
  // path would fall through into alternative 2 and lock again.
  // alternative 1: propagate the failure to the caller
  if (lock_read(inode->i_rwsem) == 0) { // success
    read value of inode->i_uid; // this is the CS of the rwsem
    unlock_read(inode->i_rwsem);
  } else { // lock_read failed
    // option 1:
    return FAILED; // return failed to caller, propagate to upper layers,
		   // then to syscall entry point, then to user application;
		   // user app may abort.
  }

  // alternative 2: users will write spinning loops code, bad CPU use
  // (busy-waiting in user logic burns CPU -- this is what blocking inside
  // lock_read avoids)
  while (lock_read(inode->i_rwsem) != 0)
    ;
  read value of inode->i_uid; // this is the CS of the rwsem
  unlock_read(inode->i_rwsem);
}
