2004-07-06 01:22:18 +00:00
|
|
|
#include <config.h>
|
|
|
|
#include <lladd/common.h>
|
|
|
|
|
2004-06-25 18:59:24 +00:00
|
|
|
#include <assert.h>
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
#include <lladd/transactional.h>
|
|
|
|
#include <lladd/bufferManager.h>
|
|
|
|
#include <lladd/constants.h>
|
|
|
|
|
|
|
|
#include "blobManager.h"
|
2004-07-15 00:42:36 +00:00
|
|
|
#include "pageFile.h"
|
2004-07-06 01:22:18 +00:00
|
|
|
#include <pbl/pbl.h>
|
2004-06-28 21:10:10 +00:00
|
|
|
|
2004-07-06 01:22:18 +00:00
|
|
|
#include <stdio.h>
|
2004-06-25 18:59:24 +00:00
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
static FILE * blobf0 = NULL, * blobf1 = NULL;
|
2004-06-26 02:05:24 +00:00
|
|
|
/**
   This is a hash of hash tables of hash tables.  The outer hash maps
   from xid to a per-transaction hash.  That hash maps from page
   number to a per-page hash, which in turn maps from rid to the lsn
   recorded for the dirty blob.
*/
|
|
|
|
static pblHashTable_t * dirtyBlobs;
|
2004-06-25 18:59:24 +00:00
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
/** Plays a nasty trick on bufferManager to force it to read and write
|
|
|
|
blob_record_t items for us. Relies upon bufferManager (and
|
|
|
|
page.c's) trust in the rid.size field... */
|
|
|
|
static void readRawRecord(int xid, recordid rid, void * buf, int size) {
|
|
|
|
recordid blob_rec_rid = rid;
|
|
|
|
blob_rec_rid.size = size;
|
2004-07-06 01:22:18 +00:00
|
|
|
/*readRecord(xid, blob_rec_rid, buf);*/
|
|
|
|
Tread(xid, blob_rec_rid, buf);
|
2004-06-28 21:10:10 +00:00
|
|
|
}
|
|
|
|
|
2004-06-30 01:09:57 +00:00
|
|
|
/**
   Writes the raw contents of a record slot by overriding the size
   stored in the rid before calling Tset().

   The rid is passed by value, so the override is local to this call.

   @param xid   transaction id, passed through to Tset().
   @param rid   record to write; only its size field is overridden.
   @param buf   source buffer (presumably at least size bytes).
   @param size  value stored into rid.size before the write.
*/
static void writeRawRecord(int xid, recordid rid, const void * buf, int size) {
  rid.size = size;
  Tset(xid, rid, buf);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2004-06-28 22:48:02 +00:00
|
|
|
/* moved verbatim from bufferManger.c, then hacked up to use FILE * instead of ints. */
|
2004-06-25 18:59:24 +00:00
|
|
|
void openBlobStore() {
|
2004-07-06 01:22:18 +00:00
|
|
|
|
|
|
|
/* the r+ mode opens an existing file read /write */
|
2004-06-30 01:09:57 +00:00
|
|
|
if( ! (blobf0 = fopen(BLOB0_FILE, "r+"))) { /* file may not exist */
|
2004-07-06 01:22:18 +00:00
|
|
|
/* the w+ mode truncates, creates, and opens read / write */
|
|
|
|
if(!(blobf0 = fopen(BLOB0_FILE, "w+"))) { perror("Couldn't open or create blob 0 file"); abort(); }
|
2004-06-26 02:05:24 +00:00
|
|
|
}
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
DEBUG("blobf0 opened.\n");
|
|
|
|
|
|
|
|
if( ! (blobf1 = fopen(BLOB1_FILE, "r+"))) { /* file may not exist */
|
2004-07-06 01:22:18 +00:00
|
|
|
if(!(blobf1 = fopen(BLOB1_FILE, "w+"))) { perror("Couldn't open or create blob 1 file"); abort(); }
|
2004-06-25 18:59:24 +00:00
|
|
|
}
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
DEBUG("blobf1 opened.\n");
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
dirtyBlobs = pblHtCreate();
|
2004-06-25 18:59:24 +00:00
|
|
|
}
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
/** Discards all changes to dirty blobs, and closes the blob store.
|
|
|
|
|
|
|
|
@todo memory leak: Will leak memory if there are any outstanding
|
|
|
|
xacts that have written to blobs. Should explicitly abort them
|
2004-06-28 21:10:10 +00:00
|
|
|
instead of just invalidating the dirtyBlobs hash.
|
|
|
|
|
2004-07-04 00:46:49 +00:00
|
|
|
(If the you fix the above todo, don't forget to fix
|
2004-06-28 21:10:10 +00:00
|
|
|
bufferManager's simulateBufferManagerCrash.)
|
2004-06-26 02:05:24 +00:00
|
|
|
*/
|
2004-06-25 18:59:24 +00:00
|
|
|
void closeBlobStore() {
|
2004-06-28 21:10:10 +00:00
|
|
|
int ret = fclose(blobf0);
|
|
|
|
assert(!ret);
|
|
|
|
ret = fclose(blobf1);
|
|
|
|
assert(!ret);
|
2004-06-26 02:05:24 +00:00
|
|
|
blobf0 = NULL;
|
|
|
|
blobf1 = NULL;
|
|
|
|
|
|
|
|
pblHtDelete(dirtyBlobs);
|
2004-06-25 18:59:24 +00:00
|
|
|
}
|
|
|
|
|
2004-07-06 01:22:18 +00:00
|
|
|
recordid preAllocBlob(int xid, long blobSize) {
|
2004-06-26 02:05:24 +00:00
|
|
|
long fileSize = myFseek(blobf1, 0, SEEK_END);
|
2004-06-25 18:59:24 +00:00
|
|
|
blob_record_t blob_rec;
|
2004-07-14 20:49:18 +00:00
|
|
|
|
2004-06-25 18:59:24 +00:00
|
|
|
/* Allocate space for the blob entry. */
|
2004-06-30 01:09:57 +00:00
|
|
|
|
2004-07-06 01:22:18 +00:00
|
|
|
DEBUG("Allocing blob (size %ld)\n", blobSize);
|
2004-06-30 01:09:57 +00:00
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
assert(blobSize > 0); /* Don't support zero length blobs right now... */
|
|
|
|
|
2004-06-25 18:59:24 +00:00
|
|
|
/* First in buffer manager. */
|
2004-06-28 21:10:10 +00:00
|
|
|
|
2004-06-30 01:09:57 +00:00
|
|
|
recordid rid = Talloc(xid, sizeof(blob_record_t));
|
2004-06-26 02:05:24 +00:00
|
|
|
|
2004-06-30 01:09:57 +00:00
|
|
|
/** Finally, fix up the fields in the record that points to the blob.
|
|
|
|
The rest of this also should go into alloc.c
|
|
|
|
*/
|
2004-06-25 18:59:24 +00:00
|
|
|
|
|
|
|
blob_rec.fd = 0;
|
|
|
|
blob_rec.size = blobSize;
|
|
|
|
blob_rec.offset = fileSize;
|
2004-07-14 20:49:18 +00:00
|
|
|
|
|
|
|
setSlotType(rid.page, rid.slot, BLOB_SLOT);
|
2004-06-25 18:59:24 +00:00
|
|
|
rid.size = BLOB_SLOT;
|
2004-06-28 22:48:02 +00:00
|
|
|
|
|
|
|
/* Tset() needs to know to 'do the right thing' here, since we've
|
|
|
|
changed the size it has recorded for this record, and
|
|
|
|
writeRawRecord makes sure that that is the case. */
|
2004-06-30 01:09:57 +00:00
|
|
|
writeRawRecord (xid, rid, &blob_rec, sizeof(blob_record_t));
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
rid.size = blob_rec.size;
|
2004-06-25 18:59:24 +00:00
|
|
|
|
|
|
|
return rid;
|
2004-06-30 01:09:57 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
   Extends both blob files so that they have room for the blob named
   by rid.

   A single zero byte is written at (current end of blobf1) +
   rid.size - 1 in each of blobf0 and blobf1.  Aborts if either write
   fails.

   NOTE(review): the blob_record_t is read back, but none of its fields
   are used below; the offset is derived from the current end of
   blobf1 instead.  Confirm that this is intended.

   @param xid  transaction id, passed through to readRawRecord().
   @param lsn  not used by this function.
   @param rid  rid of the blob; rid.size must be > 0 (zero length blobs
               are not supported right now).
*/
void allocBlob(int xid, lsn_t lsn, recordid rid) {
  long fileSize = myFseek(blobf1, 0, SEEK_END);
  blob_record_t entry;
  char padByte = 0;
  FILE * stores[2];
  int i;

  DEBUG("post Allocing blob (size %ld)\n", rid.size);

  assert(rid.size > 0);

  /* Read in record to get the correct offset, size for the blob */
  readRawRecord(xid, rid, &entry, sizeof(blob_record_t));

  stores[0] = blobf0;
  stores[1] = blobf1;

  /* Position both files on the last byte the blob will occupy... */
  for(i = 0; i < 2; i++) {
    myFseek(stores[i], fileSize + rid.size - 1, SEEK_SET);
  }

  /* ...and write a byte there, which extends each file to that point. */
  for(i = 0; i < 2; i++) {
    if(1 != fwrite(&padByte, sizeof(char), 1, stores[i])) { perror(NULL); abort(); }
  }
}
|
|
|
|
|
2004-07-14 20:49:18 +00:00
|
|
|
/**
   Looks up the lsn stored for (xid, rid) in dirtyBlobs.

   Walks the three levels of the table in turn: xid, then rid.page,
   then the rid itself.

   @return a pointer to the stored lsn_t, or NULL if any level of the
           lookup has no entry.  The pointer aliases the value held by
           the table, so writing through it updates the table.
*/
static lsn_t * tripleHashLookup(int xid, recordid rid) {
  pblHashTable_t * perXid;
  pblHashTable_t * perPage;

  perXid = pblHtLookup(dirtyBlobs, &xid, sizeof(xid));
  if(!perXid) {
    return NULL;
  }

  perPage = pblHtLookup(perXid, &(rid.page), sizeof(int));
  if(!perPage) {
    return NULL;
  }

  return pblHtLookup(perPage, &rid, sizeof(recordid));
}
|
|
|
|
|
|
|
|
/**
   Records newLSN for (xid, rid) in dirtyBlobs, creating the per-xid
   and per-page hash tables on demand.

   The lsn is stored in a heap-allocated copy.  That copy, and the
   tables created here, are released by abortBlobs().

   Aborts the process if memory for a table or for the lsn copy cannot
   be obtained, instead of dereferencing a NULL pointer.

   @param xid     transaction that dirtied the blob.
   @param rid     rid of the dirty blob; rid.page selects the per-page table.
   @param newLSN  lsn to associate with the blob.
*/
static void tripleHashInsert(int xid, recordid rid, lsn_t newLSN) {
  pblHashTable_t * xidHash;
  pblHashTable_t * pageXidHash;
  lsn_t * copy;

  xidHash = pblHtLookup(dirtyBlobs, &xid, sizeof(int));

  if(xidHash == NULL) {
    xidHash = pblHtCreate();   /* Deleted in abortBlobs */
    if(xidHash == NULL) { perror("Couldn't create per-xid dirty blob table"); abort(); }
    pblHtInsert(dirtyBlobs, &xid, sizeof(int), xidHash);
  }

  pageXidHash = pblHtLookup(xidHash, &(rid.page), sizeof(int));

  if(pageXidHash == NULL) {
    pageXidHash = pblHtCreate();   /* Deleted in abortBlobs */
    if(pageXidHash == NULL) { perror("Couldn't create per-page dirty blob table"); abort(); }
    pblHtInsert(xidHash, &(rid.page), sizeof(int), pageXidHash);
  }

  copy = malloc(sizeof(lsn_t));   /* Freed in abortBlobs */
  if(copy == NULL) { perror("Couldn't allocate dirty blob lsn"); abort(); }
  *copy = newLSN;

  pblHtInsert(pageXidHash, &rid, sizeof(recordid), copy);
}
|
2004-06-28 21:10:10 +00:00
|
|
|
/*
|
2004-06-26 02:05:24 +00:00
|
|
|
static void tripleHashRemove(int xid, recordid rid) {
|
|
|
|
pblHashTable_t * xidHash = pblHtLookup(dirtyBlobs, &xid, sizeof(int));
|
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
if(xidHash) { / * Else, there was no xid, rid pair. * /
|
2004-06-26 02:05:24 +00:00
|
|
|
pblHashTable_t * pageXidHash = pblHtLookup(xidHash, &(rid.page), sizeof(int));
|
|
|
|
|
|
|
|
if(pageXidHash) {
|
2004-06-25 18:59:24 +00:00
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
lsn_t * delme = pblHtLookup(pageXidHash, &rid, sizeof(recordid));
|
|
|
|
pblHtRemove(pageXidHash, &rid, sizeof(recordid));
|
|
|
|
free(delme);
|
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
/ * We freed a member of pageXidHash. Is it empty? * /
|
2004-06-26 02:05:24 +00:00
|
|
|
if(!pblHtFirst(pageXidHash)) {
|
|
|
|
pblHtRemove(xidHash, &(rid.page), sizeof(int));
|
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
/ * Is xidHash now empty? * /
|
2004-06-26 02:05:24 +00:00
|
|
|
if(!pblHtFirst(xidHash)) {
|
|
|
|
pblHtRemove(dirtyBlobs, &xid, sizeof(int));
|
|
|
|
free(xidHash);
|
|
|
|
}
|
|
|
|
|
|
|
|
free(pageXidHash);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2004-06-28 21:10:10 +00:00
|
|
|
}*/
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
/**
   Reads the contents of the blob named by rid into buf.

   The blob's blob_record_t is fetched with readRawRecord().  Its fd
   field selects the file (blobf1 if non-zero, blobf0 otherwise), and
   its offset and size fields give the location and length of the data
   within that file.

   There is no need to check whether the blob is dirty, since the
   record from the buffer manager will reflect that if it is.

   Aborts on an unexpected end of file or on a read error.

   @param xid  transaction id, passed through to readRawRecord().
   @param rid  rid of the blob to read.
   @param buf  destination buffer; must not be NULL.  rec.size bytes are
               written to it, so the caller is trusted to have sized it
               accordingly.
*/
void readBlob(int xid, recordid rid, void * buf) {
  blob_record_t rec;
  FILE * fd;
  long offset;

  assert(buf);

  readRawRecord(xid, rid, &rec, sizeof(blob_record_t));

  fd = rec.fd ? blobf1 : blobf0;

  offset = myFseek(fd, (long int) rec.offset, SEEK_SET);

  /* Print the buffer address with %p; casting the pointer to unsigned
     int truncates it wherever pointers are wider than int. */
  DEBUG("reading blob at offset %d (%ld), size %ld, buffer %p\n", rec.offset, offset, rec.size, (void *) buf);

  /* The seek must have landed exactly where the record says the blob is. */
  assert(rec.offset == offset);

  if(1 != fread(buf, rec.size, 1, fd)) {
    if(feof(fd)) { printf("Unexpected eof!\n"); fflush(NULL); abort(); }
    if(ferror(fd)) { printf("Error reading stream! %d", ferror(fd)); fflush(NULL); abort(); }
  }
}
|
|
|
|
|
|
|
|
/** @todo dirtyBlobs should contain the highest LSN that wrote to the
|
|
|
|
current version of the dirty blob, and the lsn field should be
|
|
|
|
checked to be sure that it increases monotonically. */
|
2004-06-28 21:10:10 +00:00
|
|
|
void writeBlob(int xid, lsn_t lsn, recordid rid, const void * buf) {
|
2004-06-28 22:48:02 +00:00
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
/* First, determine if the blob is dirty. */
|
|
|
|
lsn_t * dirty = tripleHashLookup(xid, rid);
|
2004-06-28 22:48:02 +00:00
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
blob_record_t rec;
|
|
|
|
long offset;
|
2004-06-26 02:05:24 +00:00
|
|
|
FILE * fd;
|
2004-06-28 21:10:10 +00:00
|
|
|
int readcount;
|
2004-06-26 02:05:24 +00:00
|
|
|
|
2004-07-06 01:22:18 +00:00
|
|
|
DEBUG("Writing blob (size %ld)\n", rid.size);
|
2004-06-30 01:09:57 +00:00
|
|
|
|
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
/* Tread() raw record */
|
|
|
|
readRawRecord(xid, rid, &rec, sizeof(blob_record_t));
|
2004-06-26 02:05:24 +00:00
|
|
|
|
2004-06-30 01:09:57 +00:00
|
|
|
assert(rec.size == rid.size);
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
if(dirty) {
|
|
|
|
assert(lsn > *dirty);
|
2004-06-28 22:48:02 +00:00
|
|
|
*dirty = lsn; /* Updates value in triple hash (works because of pointer aliasing.) */
|
2004-06-28 21:10:10 +00:00
|
|
|
DEBUG("Blob already dirty.\n");
|
|
|
|
|
|
|
|
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
} else {
|
2004-06-28 21:10:10 +00:00
|
|
|
DEBUG("Marking blob dirty.\n");
|
2004-06-26 02:05:24 +00:00
|
|
|
tripleHashInsert(xid, rid, lsn);
|
2004-06-28 21:10:10 +00:00
|
|
|
/* Flip the fd bit on the record. */
|
|
|
|
rec.fd = rec.fd ? 0 : 1;
|
|
|
|
|
|
|
|
/* Tset() raw record */
|
2004-06-30 01:09:57 +00:00
|
|
|
writeRawRecord(xid, rid, &rec, sizeof(blob_record_t));
|
2004-06-26 02:05:24 +00:00
|
|
|
}
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
fd = rec.fd ? blobf1 : blobf0; /* rec's fd is up-to-date, so use it directly */
|
|
|
|
|
|
|
|
offset = myFseek(fd, rec.offset, SEEK_SET);
|
|
|
|
|
2004-07-04 00:46:49 +00:00
|
|
|
DEBUG("Writing at offset = %d, size = %ld\n", rec.offset, rec.size);
|
2004-06-28 21:10:10 +00:00
|
|
|
assert(offset == rec.offset);
|
|
|
|
readcount = fwrite(buf, rec.size, 1, fd);
|
|
|
|
assert(1 == readcount);
|
2004-06-26 02:05:24 +00:00
|
|
|
|
|
|
|
/* No need to update the raw blob record. */
|
|
|
|
|
|
|
|
}
|
2004-06-30 01:09:57 +00:00
|
|
|
/** @todo check to see if commitBlobs actually needs to flush blob
|
|
|
|
files when it's called (are there any dirty blobs associated with
|
|
|
|
this transaction? */
|
2004-06-28 21:10:10 +00:00
|
|
|
void commitBlobs(int xid) {
|
2004-06-30 01:09:57 +00:00
|
|
|
|
|
|
|
fdatasync(fileno(blobf0));
|
|
|
|
fdatasync(fileno(blobf1));
|
2004-06-28 21:10:10 +00:00
|
|
|
abortBlobs(xid);
|
|
|
|
}
|
2004-06-28 22:48:02 +00:00
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
/**
|
|
|
|
Just clean up the dirty list for this xid. @todo Check return values.
|
|
|
|
|
|
|
|
(Functionally equivalent to the old rmTouch() function. Just
|
|
|
|
deletes this xid's dirty list.)
|
2004-06-26 02:05:24 +00:00
|
|
|
|
2004-06-28 22:48:02 +00:00
|
|
|
@todo The tripleHash data structure is overkill here. We only
|
|
|
|
need two layers of hash tables, but it works, and it would be a
|
|
|
|
pain to change it, unless we need to touch this file for some
|
|
|
|
other reason.
|
2004-06-28 21:10:10 +00:00
|
|
|
|
|
|
|
*/
|
2004-06-26 02:05:24 +00:00
|
|
|
void abortBlobs(int xid) {
|
2004-06-28 22:48:02 +00:00
|
|
|
/*
|
|
|
|
At first glance, it may seem easier to keep track of which blobs
|
|
|
|
are dirty only in blobManager, and then propogate those updates to
|
|
|
|
bufferManager later. It turns out that it's much easier to
|
|
|
|
propogate the changes to bufferManger, since otherwise, recovery
|
|
|
|
and undo have to reason about lazy propogation of values to the
|
|
|
|
bufferManager, and also have to preserve *write* ordering, even
|
|
|
|
though the writes may be across many transactions, and could be
|
|
|
|
propogated in the wrong order. If we generate a Tset() (for the
|
|
|
|
blob record in bufferManager) for each write, things become much
|
|
|
|
easier.
|
|
|
|
*/
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
pblHashTable_t * rid_buckets = pblHtLookup(dirtyBlobs, &xid, sizeof(int));
|
|
|
|
pblHashTable_t * this_bucket;
|
|
|
|
|
2004-06-28 21:10:10 +00:00
|
|
|
if(!rid_buckets) { return; } /* No dirty blobs for this xid.. */
|
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
for(this_bucket = pblHtFirst(rid_buckets); this_bucket; this_bucket = pblHtNext(rid_buckets)) {
|
|
|
|
lsn_t * rid_lsn;
|
|
|
|
int page_number;
|
2004-06-28 22:48:02 +00:00
|
|
|
|
2004-06-26 02:05:24 +00:00
|
|
|
/* All right, this_bucket contains all of the rids for this page. */
|
|
|
|
|
|
|
|
for(rid_lsn = pblHtFirst(this_bucket); rid_lsn; rid_lsn = pblHtNext(this_bucket)) {
|
|
|
|
recordid * rid = pblHtCurrentKey(this_bucket);
|
2004-06-28 21:10:10 +00:00
|
|
|
page_number = rid->page;
|
2004-06-26 02:05:24 +00:00
|
|
|
pblHtRemove(this_bucket, rid, sizeof(recordid));
|
|
|
|
free(rid_lsn);
|
|
|
|
}
|
|
|
|
|
|
|
|
pblHtRemove(rid_buckets, &page_number, sizeof(int));
|
|
|
|
pblHtDelete(this_bucket);
|
|
|
|
}
|
|
|
|
pblHtDelete(rid_buckets);
|
|
|
|
|
2004-06-25 18:59:24 +00:00
|
|
|
}
|