/*------------------------------------------------------------------------- * * smgr.c * public interface routines to storage manager switch. * * All file system operations in POSTGRES dispatch through these * routines. * * Portions Copyright (c) 2006-2008, Greenplum inc * Portions Copyright (c) 2012-Present Pivotal Software, Inc. * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/backend/storage/smgr/smgr.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/indexing.h" #include "commands/tablespace.h" #include "postmaster/postmaster.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/smgr.h" #include "utils/faultinjector.h" #include "utils/hsearch.h" #include "utils/inval.h" /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. */ static HTAB *SMgrRelationHash = NULL; static SMgrRelation first_unowned_reln = NULL; /* local function prototypes */ static void smgrshutdown(int code, Datum arg); static void add_to_unowned_list(SMgrRelation reln); static void remove_from_unowned_list(SMgrRelation reln); /* * smgrinit(), smgrshutdown() -- Initialize or shut down storage * managers. * * Note: smgrinit is called during backend startup (normal or standalone * case), *not* during postmaster start. Therefore, any resources created * here or destroyed in smgrshutdown are backend-local. */ void smgrinit(void) { mdinit(); /* register the shutdown proc */ on_proc_exit(smgrshutdown, 0); } /* * on_proc_exit hook for smgr cleanup during backend shutdown */ static void smgrshutdown(int code, Datum arg) { } /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. */ SMgrRelation smgropen(RelFileNode rnode, BackendId backend) { RelFileNodeBackend brnode; SMgrRelation reln; bool found; /* GPDB: don't support MyBackendId as a possible backend. */ Assert(backend == InvalidBackendId || backend == TempRelBackendId); if (SMgrRelationHash == NULL) { /* First time through: initialize the hash table */ HASHCTL ctl; MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(RelFileNodeBackend); ctl.entrysize = sizeof(SMgrRelationData); ctl.hash = tag_hash; SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_FUNCTION); first_unowned_reln = NULL; } /* Look up or create an entry */ brnode.node = rnode; brnode.backend = backend; reln = (SMgrRelation) hash_search(SMgrRelationHash, (void *) &brnode, HASH_ENTER, &found); /* Initialize it if not present before */ if (!found) { int forknum; /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; reln->smgr_which = 0; /* we only have md.c at present */ /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) reln->md_fd[forknum] = NULL; /* it has no owner yet */ add_to_unowned_list(reln); } return reln; } /* * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object * * There can be only one owner at a time; this is sufficient since currently * the only such owners exist in the relcache. */ void smgrsetowner(SMgrRelation *owner, SMgrRelation reln) { /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */ Assert(owner != NULL); /* * First, unhook any old owner. (Normally there shouldn't be any, but it * seems possible that this can happen during swap_relation_files() * depending on the order of processing. It's ok to close the old * relcache entry early in that case.) * * If there isn't an old owner, then the reln should be in the unowned * list, and we need to remove it. */ if (reln->smgr_owner) *(reln->smgr_owner) = NULL; else remove_from_unowned_list(reln); /* Now establish the ownership relationship. */ reln->smgr_owner = owner; *owner = reln; } /* * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object * if one exists */ void smgrclearowner(SMgrRelation *owner, SMgrRelation reln) { /* Do nothing if the SMgrRelation object is not owned by the owner */ if (reln->smgr_owner != owner) return; /* unset the owner's reference */ *owner = NULL; /* unset our reference to the owner */ reln->smgr_owner = NULL; add_to_unowned_list(reln); } /* * add_to_unowned_list -- link an SMgrRelation onto the unowned list * * Check remove_from_unowned_list()'s comments for performance * considerations. */ static void add_to_unowned_list(SMgrRelation reln) { /* place it at head of the list (to make smgrsetowner cheap) */ reln->next_unowned_reln = first_unowned_reln; first_unowned_reln = reln; } /* * remove_from_unowned_list -- unlink an SMgrRelation from the unowned list * * If the reln is not present in the list, nothing happens. Typically this * would be caller error, but there seems no reason to throw an error. * * In the worst case this could be rather slow; but in all the cases that seem * likely to be performance-critical, the reln being sought will actually be * first in the list. Furthermore, the number of unowned relns touched in any * one transaction shouldn't be all that high typically. So it doesn't seem * worth expending the additional space and management logic needed for a * doubly-linked list. */ static void remove_from_unowned_list(SMgrRelation reln) { SMgrRelation *link; SMgrRelation cur; for (link = &first_unowned_reln, cur = *link; cur != NULL; link = &cur->next_unowned_reln, cur = *link) { if (cur == reln) { *link = cur->next_unowned_reln; cur->next_unowned_reln = NULL; break; } } } /* * smgrexists() -- Does the underlying file for a fork exist? */ bool smgrexists(SMgrRelation reln, ForkNumber forknum) { return mdexists(reln, forknum); } /* * smgrclose() -- Close and delete an SMgrRelation object. */ void smgrclose(SMgrRelation reln) { SMgrRelation *owner; ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) mdclose(reln, forknum); owner = reln->smgr_owner; if (!owner) remove_from_unowned_list(reln); if (hash_search(SMgrRelationHash, (void *) &(reln->smgr_rnode), HASH_REMOVE, NULL) == NULL) elog(ERROR, "SMgrRelation hashtable corrupted"); /* * Unhook the owner pointer, if any. We do this last since in the remote * possibility of failure above, the SMgrRelation object will still exist. */ if (owner) *owner = NULL; } /* * smgrcloseall() -- Close all existing SMgrRelation objects. */ void smgrcloseall(void) { HASH_SEQ_STATUS status; SMgrRelation reln; /* Nothing to do if hashtable not set up */ if (SMgrRelationHash == NULL) return; hash_seq_init(&status, SMgrRelationHash); while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) smgrclose(reln); } /* * smgrclosenode() -- Close SMgrRelation object for given RelFileNode, * if one exists. * * This has the same effects as smgrclose(smgropen(rnode)), but it avoids * uselessly creating a hashtable entry only to drop it again when no * such entry exists already. */ void smgrclosenode(RelFileNodeBackend rnode) { SMgrRelation reln; /* Nothing to do if hashtable not set up */ if (SMgrRelationHash == NULL) return; reln = (SMgrRelation) hash_search(SMgrRelationHash, (void *) &rnode, HASH_FIND, NULL); if (reln != NULL) smgrclose(reln); } /* * smgrcreate() -- Create a new relation. * * Given an already-created (but presumably unused) SMgrRelation, * cause the underlying disk file or other storage for the fork * to be created. * * If isRedo is true, it is okay for the underlying file to exist * already because we are in a WAL replay sequence. */ void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { /* * Exit quickly in WAL replay mode if we've already opened the file. If * it's open, it surely must exist. */ if (isRedo && reln->md_fd[forknum] != NULL) return; /* * We may be using the target table space for the first time in this * database, so create a per-database subdirectory if needed. * * XXX this is a fairly ugly violation of module layering, but this seems * to be the best place to put the check. Maybe TablespaceCreateDbspace * should be here and not in commands/tablespace.c? But that would imply * importing a lot of stuff that smgr.c oughtn't know, either. */ TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, isRedo); mdcreate(reln, forknum, isRedo); } /* * smgrcreate_ao() -- Create a new AO relation segment. * Given a RelFileNode, cause the underlying disk file for the * AO segment to be created. * * If isRedo is true, it is okay for the underlying file to exist * already because we are in a WAL replay sequence. */ void smgrcreate_ao(RelFileNodeBackend rnode, int32 segmentFileNum, bool isRedo) { mdcreate_ao(rnode, segmentFileNum, isRedo); } /* * smgrdounlink() -- Immediately unlink all forks of a relation. * * All forks of the relation are removed from the store. This should * not be used during transactional operations, since it can't be undone. * * If isRedo is true, it is okay for the underlying file(s) to be gone * already. */ void smgrdounlink(SMgrRelation reln, bool isRedo, char relstorage) { RelFileNodeBackend rnode = reln->smgr_rnode; ForkNumber forknum; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) mdclose(reln, forknum); /* * Get rid of any remaining buffers for the relation. bufmgr will just * drop them without bothering to write the contents. * * Apart from relstorage == RELSTORAGE_HEAP do any other RELSTOARGE type * expected to have buffers in shared memory ? Can check only for * RELSTORAGE_HEAP below. */ if ((relstorage != RELSTORAGE_AOROWS) && (relstorage != RELSTORAGE_AOCOLS)) DropRelFileNodesAllBuffers(&rnode, 1); /* * It'd be nice to tell the stats collector to forget it immediately, too. * But we can't because we don't know the OID (and in cases involving * relfilenode swaps, it's not always clear which table OID to forget, * anyway). */ /* * Send a shared-inval message to force other backends to close any * dangling smgr references they may have for this rel. We should do this * before starting the actual unlinking, in case we fail partway through * that step. Note that the sinval message will eventually come back to * this backend, too, and thereby provide a backstop that we closed our * own smgr rel. */ CacheInvalidateSmgr(rnode); /* * Delete the physical file(s). * * Note: smgr_unlink must treat deletion failure as a WARNING, not an * ERROR, because we've already decided to commit or abort the current * xact. */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) mdunlink(rnode, forknum, isRedo, relstorage); } /* * smgrdounlinkall() -- Immediately unlink all forks of all given relations * * All forks of all given relations are removed from the store. This * should not be used during transactional operations, since it can't be * undone. * * If isRedo is true, it is okay for the underlying file(s) to be gone * already. * * This is equivalent to calling smgrdounlink for each relation, but it's * significantly quicker so should be preferred when possible. * * 'relstorages' is an array of pg_class.relstorage fields. It must have the * same size as 'rels'. */ void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo, char *relstorages) { int i = 0; RelFileNodeBackend *rnodes; ForkNumber forknum; bool has_heaps = false; if (nrels == 0) return; /* * create an array which contains all relations to be dropped, and close * each relation's forks at the smgr level while at it */ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels); for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; rnodes[i] = rnode; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) mdclose(rels[i], forknum); if ((relstorages[i] != RELSTORAGE_AOROWS) && (relstorages[i] != RELSTORAGE_AOCOLS)) has_heaps = true; } /* * Get rid of any remaining buffers for the relations. bufmgr will just * drop them without bothering to write the contents. */ if (has_heaps) DropRelFileNodesAllBuffers(rnodes, nrels); /* * It'd be nice to tell the stats collector to forget them immediately, * too. But we can't because we don't know the OIDs. */ /* * Send a shared-inval message to force other backends to close any * dangling smgr references they may have for these rels. We should do * this before starting the actual unlinking, in case we fail partway * through that step. Note that the sinval messages will eventually come * back to this backend, too, and thereby provide a backstop that we * closed our own smgr rel. */ for (i = 0; i < nrels; i++) CacheInvalidateSmgr(rnodes[i]); /* * Delete the physical file(s). * * Note: smgr_unlink must treat deletion failure as a WARNING, not an * ERROR, because we've already decided to commit or abort the current * xact. */ for (i = 0; i < nrels; i++) { for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) mdunlink(rnodes[i], forknum, isRedo, relstorages[i]); } pfree(rnodes); } /* * smgrextend() -- Add a new block to a file. * * The semantics are nearly the same as smgrwrite(): write at the * specified position. However, this is to be used for the case of * extending a relation (i.e., blocknum is at or beyond the current * EOF). Note that we assume writing a block beyond current EOF * causes intervening file space to become filled with zeroes. * failure we clean up by truncating. */ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { mdextend(reln, forknum, blocknum, buffer, skipFsync); } /* * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. */ void smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { mdprefetch(reln, forknum, blocknum); } /* * smgrread() -- read a particular block from a relation into the supplied * buffer. * * This routine is called from the buffer manager in order to * instantiate pages in the shared buffer cache. All storage managers * return pages in the format that POSTGRES expects. */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { mdread(reln, forknum, blocknum, buffer); } /* * smgrwrite() -- Write the supplied buffer out. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use smgrextend(). * * This is not a synchronous write -- the block is not necessarily * on disk at return, only dumped out to the kernel. However, * provisions will be made to fsync the write before the next checkpoint. * * skipFsync indicates that the caller will make other provisions to * fsync the relation, so we needn't bother. Temporary relations also * do not require fsync. */ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { mdwrite(reln, forknum, blocknum, buffer, skipFsync); } /* * smgrnblocks() -- Calculate the number of blocks in the * supplied relation. */ BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum) { return mdnblocks(reln, forknum); } /* * smgrtruncate() -- Truncate supplied relation to the specified number * of blocks * * The truncation is done immediately, so this can't be rolled back. */ void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { /* * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will * just drop them without bothering to write the contents. */ DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks); /* * Send a shared-inval message to force other backends to close any smgr * references they may have for this rel. This is useful because they * might have open file pointers to segments that got removed, and/or * smgr_targblock variables pointing past the new rel end. (The inval * message will come back to our backend, too, causing a * probably-unnecessary local smgr flush. But we don't expect that this * is a performance-critical path.) As in the unlink code, we want to be * sure the message is sent before we start changing things on-disk. */ CacheInvalidateSmgr(reln->smgr_rnode); /* * Do the truncation. */ mdtruncate(reln, forknum, nblocks); } /* * smgrimmedsync() -- Force the specified relation to stable storage. * * Synchronously force all previous writes to the specified relation * down to disk. * * This is useful for building completely new relations (eg, new * indexes). Instead of incrementally WAL-logging the index build * steps, we can just write completed index pages to disk with smgrwrite * or smgrextend, and then fsync the completed index file before * committing the transaction. (This is sufficient for purposes of * crash recovery, since it effectively duplicates forcing a checkpoint * for the completed index. But it is *not* sufficient if one wishes * to use the WAL log for PITR or replication purposes: in that case * we have to make WAL entries as well.) * * The preceding writes should specify skipFsync = true to avoid * duplicative fsyncs. * * Note that you need to do FlushRelationBuffers() first if there is * any possibility that there are dirty buffers for the relation; * otherwise the sync is not very meaningful. */ void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { mdimmedsync(reln, forknum); } /* * smgrpreckpt() -- Prepare for checkpoint. */ void smgrpreckpt(void) { mdpreckpt(); } /* * smgrsync() -- Sync files to disk during checkpoint. */ void smgrsync(void) { mdsync(); } /* * smgrpostckpt() -- Post-checkpoint cleanup. */ void smgrpostckpt(void) { mdpostckpt(); } /* * AtEOXact_SMgr * * This routine is called during transaction commit or abort (it doesn't * particularly care which). All transient SMgrRelation objects are closed. * * We do this as a compromise between wanting transient SMgrRelations to * live awhile (to amortize the costs of blind writes of multiple blocks) * and needing them to not live forever (since we're probably holding open * a kernel file descriptor for the underlying file, and we need to ensure * that gets closed reasonably soon if the file gets deleted). */ void AtEOXact_SMgr(void) { /* * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each * one from the list. */ while (first_unowned_reln != NULL) { Assert(first_unowned_reln->smgr_owner == NULL); smgrclose(first_unowned_reln); } }