Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F12910304
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
144 KB
Subscribers
None
View Options
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index f43ad7679..3bb58e560 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -1,1568 +1,1568 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl.h"
#include <algorithm>
#include <set>
#include <string>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "db/builder.h"
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "leveldb/table.h"
#include "leveldb/table_builder.h"
#include "port/port.h"
#include "table/block.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/mutexlock.h"
namespace leveldb {
const int kNumNonTableCacheFiles = 10;
// Information kept for every waiting writer
struct DBImpl::Writer {
Status status;
WriteBatch* batch;
bool sync;
bool done;
port::CondVar cv;
explicit Writer(port::Mutex* mu) : cv(mu) { }
};
struct DBImpl::CompactionState {
Compaction* const compaction;
// Sequence numbers < smallest_snapshot are not significant since we
// will never have to service a snapshot below smallest_snapshot.
// Therefore if we have seen a sequence number S <= smallest_snapshot,
// we can drop all entries for the same key with sequence numbers < S.
SequenceNumber smallest_snapshot;
// Files produced by compaction
struct Output {
uint64_t number;
uint64_t file_size;
InternalKey smallest, largest;
};
std::vector<Output> outputs;
// State kept for output being generated
WritableFile* outfile;
TableBuilder* builder;
uint64_t total_bytes;
Output* current_output() { return &outputs[outputs.size()-1]; }
explicit CompactionState(Compaction* c)
: compaction(c),
outfile(NULL),
builder(NULL),
total_bytes(0) {
}
};
// Fix user-supplied options to be reasonable
template <class T,class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
Options SanitizeOptions(const std::string& dbname,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src) {
Options result = src;
result.comparator = icmp;
result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.max_file_size, 1<<20, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20);
if (result.info_log == NULL) {
// Open a log file in the same directory as the db
src.env->CreateDir(dbname); // In case it does not exist
src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
if (!s.ok()) {
// No place suitable for logging
result.info_log = NULL;
}
}
if (result.block_cache == NULL) {
result.block_cache = NewLRUCache(8 << 20);
}
return result;
}
DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
: env_(raw_options.env),
internal_comparator_(raw_options.comparator),
internal_filter_policy_(raw_options.filter_policy),
options_(SanitizeOptions(dbname, &internal_comparator_,
&internal_filter_policy_, raw_options)),
owns_info_log_(options_.info_log != raw_options.info_log),
owns_cache_(options_.block_cache != raw_options.block_cache),
dbname_(dbname),
db_lock_(NULL),
shutting_down_(NULL),
bg_cv_(&mutex_),
mem_(NULL),
imm_(NULL),
logfile_(NULL),
logfile_number_(0),
log_(NULL),
seed_(0),
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
manual_compaction_(NULL) {
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
versions_ = new VersionSet(dbname_, &options_, table_cache_,
&internal_comparator_);
}
DBImpl::~DBImpl() {
// Wait for background work to finish
mutex_.Lock();
shutting_down_.Release_Store(this); // Any non-NULL value is ok
while (bg_compaction_scheduled_) {
bg_cv_.Wait();
}
mutex_.Unlock();
if (db_lock_ != NULL) {
env_->UnlockFile(db_lock_);
}
delete versions_;
if (mem_ != NULL) mem_->Unref();
if (imm_ != NULL) imm_->Unref();
delete tmp_batch_;
delete log_;
delete logfile_;
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
if (owns_cache_) {
delete options_.block_cache;
}
}
Status DBImpl::NewDB() {
VersionEdit new_db;
new_db.SetComparatorName(user_comparator()->Name());
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::string manifest = DescriptorFileName(dbname_, 1);
WritableFile* file;
Status s = env_->NewWritableFile(manifest, &file);
if (!s.ok()) {
return s;
}
{
log::Writer log(file);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
if (s.ok()) {
s = file->Close();
}
}
delete file;
if (s.ok()) {
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(manifest);
}
return s;
}
void DBImpl::MaybeIgnoreError(Status* s) const {
if (s->ok() || options_.paranoid_checks) {
// No change needed
} else {
Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
*s = Status::OK();
}
}
void DBImpl::DeleteObsoleteFiles() {
if (!bg_error_.ok()) {
// After a background error, we don't know whether a new version may
// or may not have been committed, so we cannot safely garbage collect.
return;
}
// Make a set of all of the live files
std::set<uint64_t> live = pending_outputs_;
versions_->AddLiveFiles(&live);
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
bool keep = true;
switch (type) {
case kLogFile:
keep = ((number >= versions_->LogNumber()) ||
(number == versions_->PrevLogNumber()));
break;
case kDescriptorFile:
// Keep my manifest file, and any newer incarnations'
// (in case there is a race that allows other incarnations)
keep = (number >= versions_->ManifestFileNumber());
break;
case kTableFile:
keep = (live.find(number) != live.end());
break;
case kTempFile:
// Any temp files that are currently being written to must
// be recorded in pending_outputs_, which is inserted into "live"
keep = (live.find(number) != live.end());
break;
case kCurrentFile:
case kDBLockFile:
case kInfoLogFile:
keep = true;
break;
}
if (!keep) {
if (type == kTableFile) {
table_cache_->Evict(number);
}
Log(options_.info_log, "Delete type=%d #%lld\n",
int(type),
static_cast<unsigned long long>(number));
env_->DeleteFile(dbname_ + "/" + filenames[i]);
}
}
}
}
Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
mutex_.AssertHeld();
// Ignore error from CreateDir since the creation of the DB is
// committed only when the descriptor is created, and this directory
// may already exist from a previous failed creation attempt.
env_->CreateDir(dbname_);
assert(db_lock_ == NULL);
Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
if (!s.ok()) {
return s;
}
if (!env_->FileExists(CurrentFileName(dbname_))) {
if (options_.create_if_missing) {
s = NewDB();
if (!s.ok()) {
return s;
}
} else {
return Status::InvalidArgument(
dbname_, "does not exist (create_if_missing is false)");
}
} else {
if (options_.error_if_exists) {
return Status::InvalidArgument(
dbname_, "exists (error_if_exists is true)");
}
}
s = versions_->Recover(save_manifest);
if (!s.ok()) {
return s;
}
SequenceNumber max_sequence(0);
// Recover from all newer log files than the ones named in the
// descriptor (new log files may have been added by the previous
// incarnation without registering them in the descriptor).
//
// Note that PrevLogNumber() is no longer used, but we pay
// attention to it in case we are recovering a database
// produced by an older version of leveldb.
const uint64_t min_log = versions_->LogNumber();
const uint64_t prev_log = versions_->PrevLogNumber();
std::vector<std::string> filenames;
s = env_->GetChildren(dbname_, &filenames);
if (!s.ok()) {
return s;
}
std::set<uint64_t> expected;
versions_->AddLiveFiles(&expected);
uint64_t number;
FileType type;
std::vector<uint64_t> logs;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
expected.erase(number);
if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
logs.push_back(number);
}
}
if (!expected.empty()) {
char buf[50];
snprintf(buf, sizeof(buf), "%d missing files; e.g.",
static_cast<int>(expected.size()));
return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
}
// Recover in the order in which the logs were generated
std::sort(logs.begin(), logs.end());
for (size_t i = 0; i < logs.size(); i++) {
s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
&max_sequence);
if (!s.ok()) {
return s;
}
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
// update the file number allocation counter in VersionSet.
versions_->MarkFileNumberUsed(logs[i]);
}
if (versions_->LastSequence() < max_sequence) {
versions_->SetLastSequence(max_sequence);
}
return Status::OK();
}
Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
bool* save_manifest, VersionEdit* edit,
SequenceNumber* max_sequence) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
const char* fname;
Status* status; // NULL if options_.paranoid_checks==false
virtual void Corruption(size_t bytes, const Status& s) {
Log(info_log, "%s%s: dropping %d bytes; %s",
(this->status == NULL ? "(ignoring error) " : ""),
fname, static_cast<int>(bytes), s.ToString().c_str());
if (this->status != NULL && this->status->ok()) *this->status = s;
}
};
mutex_.AssertHeld();
// Open the log file
std::string fname = LogFileName(dbname_, log_number);
SequentialFile* file;
Status status = env_->NewSequentialFile(fname, &file);
if (!status.ok()) {
MaybeIgnoreError(&status);
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.fname = fname.c_str();
reporter.status = (options_.paranoid_checks ? &status : NULL);
// We intentionally make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log::Reader reader(file, &reporter, true/*checksum*/,
0/*initial_offset*/);
Log(options_.info_log, "Recovering log #%llu",
(unsigned long long) log_number);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
int compactions = 0;
MemTable* mem = NULL;
while (reader.ReadRecord(&record, &scratch) &&
status.ok()) {
if (record.size() < 12) {
reporter.Corruption(
- record.size(), Status::Corruption("log record too small"));
+ record.size(), Status::Corruption("log record too small", fname));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
if (mem == NULL) {
mem = new MemTable(internal_comparator_);
mem->Ref();
}
status = WriteBatchInternal::InsertInto(&batch, mem);
MaybeIgnoreError(&status);
if (!status.ok()) {
break;
}
const SequenceNumber last_seq =
WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1;
if (last_seq > *max_sequence) {
*max_sequence = last_seq;
}
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
compactions++;
*save_manifest = true;
status = WriteLevel0Table(mem, edit, NULL);
mem->Unref();
mem = NULL;
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
break;
}
}
}
delete file;
// See if we should keep reusing the last log file.
if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
assert(logfile_ == NULL);
assert(log_ == NULL);
assert(mem_ == NULL);
uint64_t lfile_size;
if (env_->GetFileSize(fname, &lfile_size).ok() &&
env_->NewAppendableFile(fname, &logfile_).ok()) {
Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
log_ = new log::Writer(logfile_, lfile_size);
logfile_number_ = log_number;
if (mem != NULL) {
mem_ = mem;
mem = NULL;
} else {
// mem can be NULL if lognum exists but was empty.
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
}
}
}
if (mem != NULL) {
// mem did not get reused; compact it.
if (status.ok()) {
*save_manifest = true;
status = WriteLevel0Table(mem, edit, NULL);
}
mem->Unref();
}
return status;
}
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
Version* base) {
mutex_.AssertHeld();
const uint64_t start_micros = env_->NowMicros();
FileMetaData meta;
meta.number = versions_->NewFileNumber();
pending_outputs_.insert(meta.number);
Iterator* iter = mem->NewIterator();
Log(options_.info_log, "Level-0 table #%llu: started",
(unsigned long long) meta.number);
Status s;
{
mutex_.Unlock();
s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
mutex_.Lock();
}
Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
(unsigned long long) meta.number,
(unsigned long long) meta.file_size,
s.ToString().c_str());
delete iter;
pending_outputs_.erase(meta.number);
// Note that if file_size is zero, the file has been deleted and
// should not be added to the manifest.
int level = 0;
if (s.ok() && meta.file_size > 0) {
const Slice min_user_key = meta.smallest.user_key();
const Slice max_user_key = meta.largest.user_key();
if (base != NULL) {
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
}
edit->AddFile(level, meta.number, meta.file_size,
meta.smallest, meta.largest);
}
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros;
stats.bytes_written = meta.file_size;
stats_[level].Add(stats);
return s;
}
void DBImpl::CompactMemTable() {
mutex_.AssertHeld();
assert(imm_ != NULL);
// Save the contents of the memtable as a new Table
VersionEdit edit;
Version* base = versions_->current();
base->Ref();
Status s = WriteLevel0Table(imm_, &edit, base);
base->Unref();
if (s.ok() && shutting_down_.Acquire_Load()) {
s = Status::IOError("Deleting DB during memtable compaction");
}
// Replace immutable memtable with the generated Table
if (s.ok()) {
edit.SetPrevLogNumber(0);
edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
s = versions_->LogAndApply(&edit, &mutex_);
}
if (s.ok()) {
// Commit to the new state
imm_->Unref();
imm_ = NULL;
has_imm_.Release_Store(NULL);
DeleteObsoleteFiles();
} else {
RecordBackgroundError(s);
}
}
void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
int max_level_with_files = 1;
{
MutexLock l(&mutex_);
Version* base = versions_->current();
for (int level = 1; level < config::kNumLevels; level++) {
if (base->OverlapInLevel(level, begin, end)) {
max_level_with_files = level;
}
}
}
TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
for (int level = 0; level < max_level_with_files; level++) {
TEST_CompactRange(level, begin, end);
}
}
void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
assert(level >= 0);
assert(level + 1 < config::kNumLevels);
InternalKey begin_storage, end_storage;
ManualCompaction manual;
manual.level = level;
manual.done = false;
if (begin == NULL) {
manual.begin = NULL;
} else {
begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
manual.begin = &begin_storage;
}
if (end == NULL) {
manual.end = NULL;
} else {
end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
manual.end = &end_storage;
}
MutexLock l(&mutex_);
while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
if (manual_compaction_ == NULL) { // Idle
manual_compaction_ = &manual;
MaybeScheduleCompaction();
} else { // Running either my compaction or another compaction.
bg_cv_.Wait();
}
}
if (manual_compaction_ == &manual) {
// Cancel my manual compaction since we aborted early for some reason.
manual_compaction_ = NULL;
}
}
Status DBImpl::TEST_CompactMemTable() {
// NULL batch means just wait for earlier writes to be done
Status s = Write(WriteOptions(), NULL);
if (s.ok()) {
// Wait until the compaction completes
MutexLock l(&mutex_);
while (imm_ != NULL && bg_error_.ok()) {
bg_cv_.Wait();
}
if (imm_ != NULL) {
s = bg_error_;
}
}
return s;
}
void DBImpl::RecordBackgroundError(const Status& s) {
mutex_.AssertHeld();
if (bg_error_.ok()) {
bg_error_ = s;
bg_cv_.SignalAll();
}
}
void DBImpl::MaybeScheduleCompaction() {
mutex_.AssertHeld();
if (bg_compaction_scheduled_) {
// Already scheduled
} else if (shutting_down_.Acquire_Load()) {
// DB is being deleted; no more background compactions
} else if (!bg_error_.ok()) {
// Already got an error; no more changes
} else if (imm_ == NULL &&
manual_compaction_ == NULL &&
!versions_->NeedsCompaction()) {
// No work to be done
} else {
bg_compaction_scheduled_ = true;
env_->Schedule(&DBImpl::BGWork, this);
}
}
void DBImpl::BGWork(void* db) {
reinterpret_cast<DBImpl*>(db)->BackgroundCall();
}
void DBImpl::BackgroundCall() {
MutexLock l(&mutex_);
assert(bg_compaction_scheduled_);
if (shutting_down_.Acquire_Load()) {
// No more background work when shutting down.
} else if (!bg_error_.ok()) {
// No more background work after a background error.
} else {
BackgroundCompaction();
}
bg_compaction_scheduled_ = false;
// Previous compaction may have produced too many files in a level,
// so reschedule another compaction if needed.
MaybeScheduleCompaction();
bg_cv_.SignalAll();
}
void DBImpl::BackgroundCompaction() {
mutex_.AssertHeld();
if (imm_ != NULL) {
CompactMemTable();
return;
}
Compaction* c;
bool is_manual = (manual_compaction_ != NULL);
InternalKey manual_end;
if (is_manual) {
ManualCompaction* m = manual_compaction_;
c = versions_->CompactRange(m->level, m->begin, m->end);
m->done = (c == NULL);
if (c != NULL) {
manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
}
Log(options_.info_log,
"Manual compaction at level-%d from %s .. %s; will stop at %s\n",
m->level,
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"),
(m->done ? "(end)" : manual_end.DebugString().c_str()));
} else {
c = versions_->PickCompaction();
}
Status status;
if (c == NULL) {
// Nothing to do
} else if (!is_manual && c->IsTrivialMove()) {
// Move file to next level
assert(c->num_input_files(0) == 1);
FileMetaData* f = c->input(0, 0);
c->edit()->DeleteFile(c->level(), f->number);
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest);
status = versions_->LogAndApply(c->edit(), &mutex_);
if (!status.ok()) {
RecordBackgroundError(status);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
static_cast<unsigned long long>(f->number),
c->level() + 1,
static_cast<unsigned long long>(f->file_size),
status.ToString().c_str(),
versions_->LevelSummary(&tmp));
} else {
CompactionState* compact = new CompactionState(c);
status = DoCompactionWork(compact);
if (!status.ok()) {
RecordBackgroundError(status);
}
CleanupCompaction(compact);
c->ReleaseInputs();
DeleteObsoleteFiles();
}
delete c;
if (status.ok()) {
// Done
} else if (shutting_down_.Acquire_Load()) {
// Ignore compaction errors found during shutting down
} else {
Log(options_.info_log,
"Compaction error: %s", status.ToString().c_str());
}
if (is_manual) {
ManualCompaction* m = manual_compaction_;
if (!status.ok()) {
m->done = true;
}
if (!m->done) {
// We only compacted part of the requested range. Update *m
// to the range that is left to be compacted.
m->tmp_storage = manual_end;
m->begin = &m->tmp_storage;
}
manual_compaction_ = NULL;
}
}
void DBImpl::CleanupCompaction(CompactionState* compact) {
mutex_.AssertHeld();
if (compact->builder != NULL) {
// May happen if we get a shutdown call in the middle of compaction
compact->builder->Abandon();
delete compact->builder;
} else {
assert(compact->outfile == NULL);
}
delete compact->outfile;
for (size_t i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i];
pending_outputs_.erase(out.number);
}
delete compact;
}
Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
assert(compact != NULL);
assert(compact->builder == NULL);
uint64_t file_number;
{
mutex_.Lock();
file_number = versions_->NewFileNumber();
pending_outputs_.insert(file_number);
CompactionState::Output out;
out.number = file_number;
out.smallest.Clear();
out.largest.Clear();
compact->outputs.push_back(out);
mutex_.Unlock();
}
// Make the output file
std::string fname = TableFileName(dbname_, file_number);
Status s = env_->NewWritableFile(fname, &compact->outfile);
if (s.ok()) {
compact->builder = new TableBuilder(options_, compact->outfile);
}
return s;
}
Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
Iterator* input) {
assert(compact != NULL);
assert(compact->outfile != NULL);
assert(compact->builder != NULL);
const uint64_t output_number = compact->current_output()->number;
assert(output_number != 0);
// Check for iterator errors
Status s = input->status();
const uint64_t current_entries = compact->builder->NumEntries();
if (s.ok()) {
s = compact->builder->Finish();
} else {
compact->builder->Abandon();
}
const uint64_t current_bytes = compact->builder->FileSize();
compact->current_output()->file_size = current_bytes;
compact->total_bytes += current_bytes;
delete compact->builder;
compact->builder = NULL;
// Finish and check for file errors
if (s.ok()) {
s = compact->outfile->Sync();
}
if (s.ok()) {
s = compact->outfile->Close();
}
delete compact->outfile;
compact->outfile = NULL;
if (s.ok() && current_entries > 0) {
// Verify that the table is usable
Iterator* iter = table_cache_->NewIterator(ReadOptions(),
output_number,
current_bytes);
s = iter->status();
delete iter;
if (s.ok()) {
Log(options_.info_log,
"Generated table #%llu@%d: %lld keys, %lld bytes",
(unsigned long long) output_number,
compact->compaction->level(),
(unsigned long long) current_entries,
(unsigned long long) current_bytes);
}
}
return s;
}
Status DBImpl::InstallCompactionResults(CompactionState* compact) {
mutex_.AssertHeld();
Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
compact->compaction->num_input_files(0),
compact->compaction->level(),
compact->compaction->num_input_files(1),
compact->compaction->level() + 1,
static_cast<long long>(compact->total_bytes));
// Add compaction outputs
compact->compaction->AddInputDeletions(compact->compaction->edit());
const int level = compact->compaction->level();
for (size_t i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i];
compact->compaction->edit()->AddFile(
level + 1,
out.number, out.file_size, out.smallest, out.largest);
}
return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
}
Status DBImpl::DoCompactionWork(CompactionState* compact) {
const uint64_t start_micros = env_->NowMicros();
int64_t imm_micros = 0; // Micros spent doing imm_ compactions
Log(options_.info_log, "Compacting %d@%d + %d@%d files",
compact->compaction->num_input_files(0),
compact->compaction->level(),
compact->compaction->num_input_files(1),
compact->compaction->level() + 1);
assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
assert(compact->builder == NULL);
assert(compact->outfile == NULL);
if (snapshots_.empty()) {
compact->smallest_snapshot = versions_->LastSequence();
} else {
compact->smallest_snapshot = snapshots_.oldest()->number_;
}
// Release mutex while we're actually doing the compaction work
mutex_.Unlock();
Iterator* input = versions_->MakeInputIterator(compact->compaction);
input->SeekToFirst();
Status status;
ParsedInternalKey ikey;
std::string current_user_key;
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
// Prioritize immutable compaction work
if (has_imm_.NoBarrier_Load() != NULL) {
const uint64_t imm_start = env_->NowMicros();
mutex_.Lock();
if (imm_ != NULL) {
CompactMemTable();
bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
}
mutex_.Unlock();
imm_micros += (env_->NowMicros() - imm_start);
}
Slice key = input->key();
if (compact->compaction->ShouldStopBefore(key) &&
compact->builder != NULL) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
// Handle key/value, add to state, etc.
bool drop = false;
if (!ParseInternalKey(key, &ikey)) {
// Do not hide error keys
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
} else {
if (!has_current_user_key ||
user_comparator()->Compare(ikey.user_key,
Slice(current_user_key)) != 0) {
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
}
if (last_sequence_for_key <= compact->smallest_snapshot) {
// Hidden by an newer entry for same user key
drop = true; // (A)
} else if (ikey.type == kTypeDeletion &&
ikey.sequence <= compact->smallest_snapshot &&
compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
// smaller sequence numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;
}
last_sequence_for_key = ikey.sequence;
}
#if 0
Log(options_.info_log,
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
"%d smallest_snapshot: %d",
ikey.user_key.ToString().c_str(),
(int)ikey.sequence, ikey.type, kTypeValue, drop,
compact->compaction->IsBaseLevelForKey(ikey.user_key),
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
#endif
if (!drop) {
// Open output file if necessary
if (compact->builder == NULL) {
status = OpenCompactionOutputFile(compact);
if (!status.ok()) {
break;
}
}
if (compact->builder->NumEntries() == 0) {
compact->current_output()->smallest.DecodeFrom(key);
}
compact->current_output()->largest.DecodeFrom(key);
compact->builder->Add(key, input->value());
// Close output file if it is big enough
if (compact->builder->FileSize() >=
compact->compaction->MaxOutputFileSize()) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
}
input->Next();
}
if (status.ok() && shutting_down_.Acquire_Load()) {
status = Status::IOError("Deleting DB during compaction");
}
if (status.ok() && compact->builder != NULL) {
status = FinishCompactionOutputFile(compact, input);
}
if (status.ok()) {
status = input->status();
}
delete input;
input = NULL;
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros;
for (int which = 0; which < 2; which++) {
for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
stats.bytes_read += compact->compaction->input(which, i)->file_size;
}
}
for (size_t i = 0; i < compact->outputs.size(); i++) {
stats.bytes_written += compact->outputs[i].file_size;
}
mutex_.Lock();
stats_[compact->compaction->level() + 1].Add(stats);
if (status.ok()) {
status = InstallCompactionResults(compact);
}
if (!status.ok()) {
RecordBackgroundError(status);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log,
"compacted to: %s", versions_->LevelSummary(&tmp));
return status;
}
namespace {
struct IterState {
port::Mutex* mu;
Version* version;
MemTable* mem;
MemTable* imm;
};
static void CleanupIteratorState(void* arg1, void* arg2) {
IterState* state = reinterpret_cast<IterState*>(arg1);
state->mu->Lock();
state->mem->Unref();
if (state->imm != NULL) state->imm->Unref();
state->version->Unref();
state->mu->Unlock();
delete state;
}
} // namespace
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot,
uint32_t* seed) {
IterState* cleanup = new IterState;
mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
// Collect together all needed child iterators
std::vector<Iterator*> list;
list.push_back(mem_->NewIterator());
mem_->Ref();
if (imm_ != NULL) {
list.push_back(imm_->NewIterator());
imm_->Ref();
}
versions_->current()->AddIterators(options, &list);
Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size());
versions_->current()->Ref();
cleanup->mu = &mutex_;
cleanup->mem = mem_;
cleanup->imm = imm_;
cleanup->version = versions_->current();
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
*seed = ++seed_;
mutex_.Unlock();
return internal_iter;
}
Iterator* DBImpl::TEST_NewInternalIterator() {
SequenceNumber ignored;
uint32_t ignored_seed;
return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
MutexLock l(&mutex_);
return versions_->MaxNextLevelOverlappingBytes();
}
Status DBImpl::Get(const ReadOptions& options,
const Slice& key,
std::string* value) {
Status s;
MutexLock l(&mutex_);
SequenceNumber snapshot;
if (options.snapshot != NULL) {
snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
} else {
snapshot = versions_->LastSequence();
}
MemTable* mem = mem_;
MemTable* imm = imm_;
Version* current = versions_->current();
mem->Ref();
if (imm != NULL) imm->Ref();
current->Ref();
bool have_stat_update = false;
Version::GetStats stats;
// Unlock while reading from files and memtables
{
mutex_.Unlock();
// First look in the memtable, then in the immutable memtable (if any).
LookupKey lkey(key, snapshot);
if (mem->Get(lkey, value, &s)) {
// Done
} else if (imm != NULL && imm->Get(lkey, value, &s)) {
// Done
} else {
s = current->Get(options, lkey, value, &stats);
have_stat_update = true;
}
mutex_.Lock();
}
if (have_stat_update && current->UpdateStats(stats)) {
MaybeScheduleCompaction();
}
mem->Unref();
if (imm != NULL) imm->Unref();
current->Unref();
return s;
}
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot;
uint32_t seed;
Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
return NewDBIterator(
this, user_comparator(), iter,
(options.snapshot != NULL
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot),
seed);
}
void DBImpl::RecordReadSample(Slice key) {
MutexLock l(&mutex_);
if (versions_->current()->RecordReadSample(key)) {
MaybeScheduleCompaction();
}
}
const Snapshot* DBImpl::GetSnapshot() {
MutexLock l(&mutex_);
return snapshots_.New(versions_->LastSequence());
}
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
MutexLock l(&mutex_);
snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
}
// Convenience methods
Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
return DB::Put(o, key, val);
}
Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
return DB::Delete(options, key);
}
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
Writer w(&mutex_);
w.batch = my_batch;
w.sync = options.sync;
w.done = false;
MutexLock l(&mutex_);
writers_.push_back(&w);
while (!w.done && &w != writers_.front()) {
w.cv.Wait();
}
if (w.done) {
return w.status;
}
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(my_batch == NULL);
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && my_batch != NULL) { // NULL batch is for compactions
WriteBatch* updates = BuildBatchGroup(&last_writer);
WriteBatchInternal::SetSequence(updates, last_sequence + 1);
last_sequence += WriteBatchInternal::Count(updates);
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into mem_.
{
mutex_.Unlock();
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
bool sync_error = false;
if (status.ok() && options.sync) {
status = logfile_->Sync();
if (!status.ok()) {
sync_error = true;
}
}
if (status.ok()) {
status = WriteBatchInternal::InsertInto(updates, mem_);
}
mutex_.Lock();
if (sync_error) {
// The state of the log file is indeterminate: the log record we
// just added may or may not show up when the DB is re-opened.
// So we force the DB into a mode where all future writes fail.
RecordBackgroundError(status);
}
}
if (updates == tmp_batch_) tmp_batch_->Clear();
versions_->SetLastSequence(last_sequence);
}
while (true) {
Writer* ready = writers_.front();
writers_.pop_front();
if (ready != &w) {
ready->status = status;
ready->done = true;
ready->cv.Signal();
}
if (ready == last_writer) break;
}
// Notify new head of write queue
if (!writers_.empty()) {
writers_.front()->cv.Signal();
}
return status;
}
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-NULL batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
assert(!writers_.empty());
Writer* first = writers_.front();
WriteBatch* result = first->batch;
assert(result != NULL);
size_t size = WriteBatchInternal::ByteSize(first->batch);
// Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow
// down the small write too much.
size_t max_size = 1 << 20;
if (size <= (128<<10)) {
max_size = size + (128<<10);
}
*last_writer = first;
std::deque<Writer*>::iterator iter = writers_.begin();
++iter; // Advance past "first"
for (; iter != writers_.end(); ++iter) {
Writer* w = *iter;
if (w->sync && !first->sync) {
// Do not include a sync write into a batch handled by a non-sync write.
break;
}
if (w->batch != NULL) {
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}
// Append to *result
if (result == first->batch) {
// Switch to temporary batch instead of disturbing caller's batch
result = tmp_batch_;
assert(WriteBatchInternal::Count(result) == 0);
WriteBatchInternal::Append(result, first->batch);
}
WriteBatchInternal::Append(result, w->batch);
}
*last_writer = w;
}
return result;
}
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
Status s;
while (true) {
if (!bg_error_.ok()) {
// Yield previous error
s = bg_error_;
break;
} else if (
allow_delay &&
versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
// individual write by 1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
mutex_.Unlock();
env_->SleepForMicroseconds(1000);
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
} else if (!force &&
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
// There is room in current memtable
break;
} else if (imm_ != NULL) {
// We have filled up the current memtable, but the previous
// one is still being compacted, so we wait.
Log(options_.info_log, "Current memtable full; waiting...\n");
bg_cv_.Wait();
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
// There are too many level-0 files.
Log(options_.info_log, "Too many L0 files; waiting...\n");
bg_cv_.Wait();
} else {
// Attempt to switch to a new memtable and trigger compaction of old
assert(versions_->PrevLogNumber() == 0);
uint64_t new_log_number = versions_->NewFileNumber();
WritableFile* lfile = NULL;
s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
if (!s.ok()) {
// Avoid chewing through file number space in a tight loop.
versions_->ReuseFileNumber(new_log_number);
break;
}
delete log_;
delete logfile_;
logfile_ = lfile;
logfile_number_ = new_log_number;
log_ = new log::Writer(lfile);
imm_ = mem_;
has_imm_.Release_Store(imm_);
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
force = false; // Do not force another compaction if have room
MaybeScheduleCompaction();
}
}
return s;
}
bool DBImpl::GetProperty(const Slice& property, std::string* value) {
value->clear();
MutexLock l(&mutex_);
Slice in = property;
Slice prefix("leveldb.");
if (!in.starts_with(prefix)) return false;
in.remove_prefix(prefix.size());
if (in.starts_with("num-files-at-level")) {
in.remove_prefix(strlen("num-files-at-level"));
uint64_t level;
bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
if (!ok || level >= config::kNumLevels) {
return false;
} else {
char buf[100];
snprintf(buf, sizeof(buf), "%d",
versions_->NumLevelFiles(static_cast<int>(level)));
*value = buf;
return true;
}
} else if (in == "stats") {
char buf[200];
snprintf(buf, sizeof(buf),
" Compactions\n"
"Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
"--------------------------------------------------\n"
);
value->append(buf);
for (int level = 0; level < config::kNumLevels; level++) {
int files = versions_->NumLevelFiles(level);
if (stats_[level].micros > 0 || files > 0) {
snprintf(
buf, sizeof(buf),
"%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
level,
files,
versions_->NumLevelBytes(level) / 1048576.0,
stats_[level].micros / 1e6,
stats_[level].bytes_read / 1048576.0,
stats_[level].bytes_written / 1048576.0);
value->append(buf);
}
}
return true;
} else if (in == "sstables") {
*value = versions_->current()->DebugString();
return true;
} else if (in == "approximate-memory-usage") {
size_t total_usage = options_.block_cache->TotalCharge();
if (mem_) {
total_usage += mem_->ApproximateMemoryUsage();
}
if (imm_) {
total_usage += imm_->ApproximateMemoryUsage();
}
char buf[50];
snprintf(buf, sizeof(buf), "%llu",
static_cast<unsigned long long>(total_usage));
value->append(buf);
return true;
}
return false;
}
void DBImpl::GetApproximateSizes(
const Range* range, int n,
uint64_t* sizes) {
// TODO(opt): better implementation
Version* v;
{
MutexLock l(&mutex_);
versions_->current()->Ref();
v = versions_->current();
}
for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key.
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
uint64_t start = versions_->ApproximateOffsetOf(v, k1);
uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
sizes[i] = (limit >= start ? limit - start : 0);
}
{
MutexLock l(&mutex_);
v->Unref();
}
}
// Default implementations of convenience methods that subclasses of DB
// can call if they wish
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
WriteBatch batch;
batch.Put(key, value);
return Write(opt, &batch);
}
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
WriteBatch batch;
batch.Delete(key);
return Write(opt, &batch);
}
DB::~DB() { }
Status DB::Open(const Options& options, const std::string& dbname,
DB** dbptr) {
*dbptr = NULL;
DBImpl* impl = new DBImpl(options, dbname);
impl->mutex_.Lock();
VersionEdit edit;
// Recover handles create_if_missing, error_if_exists
bool save_manifest = false;
Status s = impl->Recover(&edit, &save_manifest);
if (s.ok() && impl->mem_ == NULL) {
// Create new log and a corresponding memtable.
uint64_t new_log_number = impl->versions_->NewFileNumber();
WritableFile* lfile;
s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
&lfile);
if (s.ok()) {
edit.SetLogNumber(new_log_number);
impl->logfile_ = lfile;
impl->logfile_number_ = new_log_number;
impl->log_ = new log::Writer(lfile);
impl->mem_ = new MemTable(impl->internal_comparator_);
impl->mem_->Ref();
}
}
if (s.ok() && save_manifest) {
edit.SetPrevLogNumber(0); // No older logs needed after recovery.
edit.SetLogNumber(impl->logfile_number_);
s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
}
if (s.ok()) {
impl->DeleteObsoleteFiles();
impl->MaybeScheduleCompaction();
}
impl->mutex_.Unlock();
if (s.ok()) {
assert(impl->mem_ != NULL);
*dbptr = impl;
} else {
delete impl;
}
return s;
}
Snapshot::~Snapshot() {
}
Status DestroyDB(const std::string& dbname, const Options& options) {
Env* env = options.env;
std::vector<std::string> filenames;
// Ignore error in case directory does not exist
env->GetChildren(dbname, &filenames);
if (filenames.empty()) {
return Status::OK();
}
FileLock* lock;
const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock);
if (result.ok()) {
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type != kDBLockFile) { // Lock file will be deleted at end
Status del = env->DeleteFile(dbname + "/" + filenames[i]);
if (result.ok() && !del.ok()) {
result = del;
}
}
}
env->UnlockFile(lock); // Ignore error since state is already gone
env->DeleteFile(lockname);
env->DeleteDir(dbname); // Ignore error in case dir contains other files
}
return result;
}
} // namespace leveldb
diff --git a/src/leveldb/db/leveldbutil.cc b/src/leveldb/db/leveldbutil.cc
index 9f4b7dd70..d06d64d64 100644
--- a/src/leveldb/db/leveldbutil.cc
+++ b/src/leveldb/db/leveldbutil.cc
@@ -1,64 +1,65 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "leveldb/dumpfile.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
namespace leveldb {
namespace {
class StdoutPrinter : public WritableFile {
public:
virtual Status Append(const Slice& data) {
fwrite(data.data(), 1, data.size(), stdout);
return Status::OK();
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
+ virtual std::string GetName() const { return "[stdout]"; }
};
bool HandleDumpCommand(Env* env, char** files, int num) {
StdoutPrinter printer;
bool ok = true;
for (int i = 0; i < num; i++) {
Status s = DumpFile(env, files[i], &printer);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
ok = false;
}
}
return ok;
}
} // namespace
} // namespace leveldb
static void Usage() {
fprintf(
stderr,
"Usage: leveldbutil command...\n"
" dump files... -- dump contents of specified files\n"
);
}
int main(int argc, char** argv) {
leveldb::Env* env = leveldb::Env::Default();
bool ok = true;
if (argc < 2) {
Usage();
ok = false;
} else {
std::string command = argv[1];
if (command == "dump") {
ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
} else {
Usage();
ok = false;
}
}
return (ok ? 0 : 1);
}
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
index a6d304545..8b6ad136d 100644
--- a/src/leveldb/db/log_reader.cc
+++ b/src/leveldb/db/log_reader.cc
@@ -1,284 +1,284 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdio.h>
#include "leveldb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset),
resyncing_(initial_offset > 0) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::SkipToInitialBlock() {
size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
offset_in_block = 0;
block_start_location += kBlockSize;
}
end_of_buffer_offset_ = block_start_location;
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear();
record->clear();
bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true) {
const unsigned int record_type = ReadPhysicalRecord(&fragment);
// ReadPhysicalRecord may have only had an empty trailer remaining in its
// internal buffer. Calculate the offset of the next physical record now
// that it has returned, properly accounting for its header size.
uint64_t physical_record_offset =
end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
if (resyncing_) {
if (record_type == kMiddleType) {
continue;
} else if (record_type == kLastType) {
resyncing_ = false;
continue;
} else {
resyncing_ = false;
}
}
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset;
return true;
case kFirstType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;
return true;
}
break;
case kEof:
if (in_fragmented_record) {
// This can be caused by the writer dying immediately after
// writing a physical record but before completing the next; don't
// treat it as a corruption, just ignore the entire logical record.
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default: {
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}
}
}
return false;
}
uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}
void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
- ReportDrop(bytes, Status::Corruption(reason));
+ ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
}
void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(static_cast<size_t>(bytes), reason);
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() < kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
eof_ = true;
return kEof;
} else if (buffer_.size() < kBlockSize) {
eof_ = true;
}
continue;
} else {
// Note that if buffer_ is non-empty, we have a truncated header at the
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Instead of considering this an error,
// just report EOF.
buffer_.clear();
return kEof;
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
if (!eof_) {
ReportCorruption(drop_size, "bad record length");
return kBadRecord;
}
// If the end of the file has been reached without reading |length| bytes
// of payload, assume the writer died in the middle of writing the record.
// Don't report a corruption.
return kEof;
}
if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
// such records are produced by the mmap based writing code in
// env_posix.cc that preallocates file regions.
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
// Drop the rest of the buffer since "length" itself may have
// been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look
// like a valid log record.
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length);
return type;
}
}
} // namespace log
} // namespace leveldb
diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc
index 4cd4bb047..7281e3d34 100644
--- a/src/leveldb/db/repair.cc
+++ b/src/leveldb/db/repair.cc
@@ -1,461 +1,461 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
namespace leveldb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache),
next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10);
}
~Repairer() {
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
if (owns_cache_) {
delete options_.block_cache;
}
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(options_.info_log,
"**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_;
bool owns_info_log_;
bool owns_cache_;
TableCache* table_cache_;
VersionEdit edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
SequentialFile* lfile;
Status status = env_->NewSequentialFile(logname, &lfile);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentionally make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(lfile, &reporter, false/*do not checksum*/,
0/*initial_offset*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = new MemTable(icmp_);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
- record.size(), Status::Corruption("log record too small"));
+ record.size(), Status::Corruption("log record too small", logname));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
delete lfile;
// Do not record a version edit for this conversion to a Table
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
delete iter;
mem->Unref();
mem = NULL;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
for (size_t i = 0; i < table_numbers_.size(); i++) {
ScanTable(table_numbers_[i]);
}
}
Iterator* NewTableIterator(const FileMetaData& meta) {
// Same as compaction iterators: if paranoid_checks are on, turn
// on checksum verification.
ReadOptions r;
r.verify_checksums = options_.paranoid_checks;
return table_cache_->NewIterator(r, meta.number, meta.file_size);
}
void ScanTable(uint64_t number) {
TableInfo t;
t.meta.number = number;
std::string fname = TableFileName(dbname_, number);
Status status = env_->GetFileSize(fname, &t.meta.file_size);
if (!status.ok()) {
// Try alternate file name.
fname = SSTTableFileName(dbname_, number);
Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
if (s2.ok()) {
status = Status::OK();
}
}
if (!status.ok()) {
ArchiveFile(TableFileName(dbname_, number));
ArchiveFile(SSTTableFileName(dbname_, number));
Log(options_.info_log, "Table #%llu: dropped: %s",
(unsigned long long) t.meta.number,
status.ToString().c_str());
return;
}
// Extract metadata by scanning through table.
int counter = 0;
Iterator* iter = NewTableIterator(t.meta);
bool empty = true;
ParsedInternalKey parsed;
t.max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t.meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t.meta.smallest.DecodeFrom(key);
}
t.meta.largest.DecodeFrom(key);
if (parsed.sequence > t.max_sequence) {
t.max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t.meta.number,
counter,
status.ToString().c_str());
if (status.ok()) {
tables_.push_back(t);
} else {
RepairTable(fname, t); // RepairTable archives input file.
}
}
void RepairTable(const std::string& src, TableInfo t) {
// We will copy src contents to a new table and then rename the
// new table over the source.
// Create builder.
std::string copy = TableFileName(dbname_, next_file_number_++);
WritableFile* file;
Status s = env_->NewWritableFile(copy, &file);
if (!s.ok()) {
return;
}
TableBuilder* builder = new TableBuilder(options_, file);
// Copy data.
Iterator* iter = NewTableIterator(t.meta);
int counter = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
builder->Add(iter->key(), iter->value());
counter++;
}
delete iter;
ArchiveFile(src);
if (counter == 0) {
builder->Abandon(); // Nothing to save
} else {
s = builder->Finish();
if (s.ok()) {
t.meta.file_size = builder->FileSize();
}
}
delete builder;
builder = NULL;
if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;
if (counter > 0 && s.ok()) {
std::string orig = TableFileName(dbname_, t.meta.number);
s = env_->RenameFile(copy, orig);
if (s.ok()) {
Log(options_.info_log, "Table #%llu: %d entries repaired",
(unsigned long long) t.meta.number, counter);
tables_.push_back(t);
}
}
if (!s.ok()) {
env_->DeleteFile(copy);
}
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file);
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(file);
std::string record;
edit_.EncodeTo(&record);
status = log.AddRecord(record);
}
if (status.ok()) {
status = file->Close();
}
delete file;
file = NULL;
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (size_t i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
} // namespace
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
} // namespace leveldb
diff --git a/src/leveldb/helpers/memenv/memenv.cc b/src/leveldb/helpers/memenv/memenv.cc
index 9a98884da..68c0614a5 100644
--- a/src/leveldb/helpers/memenv/memenv.cc
+++ b/src/leveldb/helpers/memenv/memenv.cc
@@ -1,398 +1,401 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include <map>
#include <string.h>
#include <string>
#include <vector>
namespace leveldb {
namespace {
class FileState {
public:
// FileStates are reference counted. The initial reference count is zero
// and the caller must call Ref() at least once.
FileState() : refs_(0), size_(0) {}
// Increase the reference count.
void Ref() {
MutexLock lock(&refs_mutex_);
++refs_;
}
// Decrease the reference count. Delete if this is the last reference.
void Unref() {
bool do_delete = false;
{
MutexLock lock(&refs_mutex_);
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
do_delete = true;
}
}
if (do_delete) {
delete this;
}
}
uint64_t Size() const { return size_; }
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
if (offset > size_) {
return Status::IOError("Offset greater than file size.");
}
const uint64_t available = size_ - offset;
if (n > available) {
n = static_cast<size_t>(available);
}
if (n == 0) {
*result = Slice();
return Status::OK();
}
assert(offset / kBlockSize <= SIZE_MAX);
size_t block = static_cast<size_t>(offset / kBlockSize);
size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) {
// The requested bytes are all in the first block.
*result = Slice(blocks_[block] + block_offset, n);
return Status::OK();
}
size_t bytes_to_copy = n;
char* dst = scratch;
while (bytes_to_copy > 0) {
size_t avail = kBlockSize - block_offset;
if (avail > bytes_to_copy) {
avail = bytes_to_copy;
}
memcpy(dst, blocks_[block] + block_offset, avail);
bytes_to_copy -= avail;
dst += avail;
block++;
block_offset = 0;
}
*result = Slice(scratch, n);
return Status::OK();
}
Status Append(const Slice& data) {
const char* src = data.data();
size_t src_len = data.size();
while (src_len > 0) {
size_t avail;
size_t offset = size_ % kBlockSize;
if (offset != 0) {
// There is some room in the last block.
avail = kBlockSize - offset;
} else {
// No room in the last block; push new one.
blocks_.push_back(new char[kBlockSize]);
avail = kBlockSize;
}
if (avail > src_len) {
avail = src_len;
}
memcpy(blocks_.back() + offset, src, avail);
src_len -= avail;
src += avail;
size_ += avail;
}
return Status::OK();
}
private:
// Private since only Unref() should be used to delete it.
~FileState() {
for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
++i) {
delete [] *i;
}
}
// No copying allowed.
FileState(const FileState&);
void operator=(const FileState&);
port::Mutex refs_mutex_;
int refs_; // Protected by refs_mutex_;
// The following fields are not protected by any mutex. They are only mutable
// while the file is being written, and concurrent access is not allowed
// to writable files.
std::vector<char*> blocks_;
uint64_t size_;
enum { kBlockSize = 8 * 1024 };
};
class SequentialFileImpl : public SequentialFile {
public:
explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
file_->Ref();
}
~SequentialFileImpl() {
file_->Unref();
}
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s = file_->Read(pos_, n, result, scratch);
if (s.ok()) {
pos_ += result->size();
}
return s;
}
virtual Status Skip(uint64_t n) {
if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()");
}
const uint64_t available = file_->Size() - pos_;
if (n > available) {
n = available;
}
pos_ += n;
return Status::OK();
}
+ virtual std::string GetName() const { return "[memenv]"; }
private:
FileState* file_;
uint64_t pos_;
};
class RandomAccessFileImpl : public RandomAccessFile {
public:
explicit RandomAccessFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~RandomAccessFileImpl() {
file_->Unref();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return file_->Read(offset, n, result, scratch);
}
+ virtual std::string GetName() const { return "[memenv]"; }
private:
FileState* file_;
};
class WritableFileImpl : public WritableFile {
public:
WritableFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~WritableFileImpl() {
file_->Unref();
}
virtual Status Append(const Slice& data) {
return file_->Append(data);
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
+ virtual std::string GetName() const { return "[memenv]"; }
private:
FileState* file_;
};
class NoOpLogger : public Logger {
public:
virtual void Logv(const char* format, va_list ap) { }
};
class InMemoryEnv : public EnvWrapper {
public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
virtual ~InMemoryEnv() {
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
i->second->Unref();
}
}
// Partial implementation of the Env interface.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new SequentialFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new RandomAccessFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) != file_map_.end()) {
DeleteFileInternal(fname);
}
FileState* file = new FileState();
file->Ref();
file_map_[fname] = file;
*result = new WritableFileImpl(file);
return Status::OK();
}
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result) {
MutexLock lock(&mutex_);
FileState** sptr = &file_map_[fname];
FileState* file = *sptr;
if (file == NULL) {
file = new FileState();
file->Ref();
}
*result = new WritableFileImpl(file);
return Status::OK();
}
virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end();
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
MutexLock lock(&mutex_);
result->clear();
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
const std::string& filename = i->first;
if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
Slice(filename).starts_with(Slice(dir))) {
result->push_back(filename.substr(dir.size() + 1));
}
}
return Status::OK();
}
void DeleteFileInternal(const std::string& fname) {
if (file_map_.find(fname) == file_map_.end()) {
return;
}
file_map_[fname]->Unref();
file_map_.erase(fname);
}
virtual Status DeleteFile(const std::string& fname) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
DeleteFileInternal(fname);
return Status::OK();
}
virtual Status CreateDir(const std::string& dirname) {
return Status::OK();
}
virtual Status DeleteDir(const std::string& dirname) {
return Status::OK();
}
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
*file_size = file_map_[fname]->Size();
return Status::OK();
}
virtual Status RenameFile(const std::string& src,
const std::string& target) {
MutexLock lock(&mutex_);
if (file_map_.find(src) == file_map_.end()) {
return Status::IOError(src, "File not found");
}
DeleteFileInternal(target);
file_map_[target] = file_map_[src];
file_map_.erase(src);
return Status::OK();
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = new FileLock;
return Status::OK();
}
virtual Status UnlockFile(FileLock* lock) {
delete lock;
return Status::OK();
}
virtual Status GetTestDirectory(std::string* path) {
*path = "/test";
return Status::OK();
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
*result = new NoOpLogger;
return Status::OK();
}
private:
// Map from filenames to FileState objects, representing a simple file system.
typedef std::map<std::string, FileState*> FileSystem;
port::Mutex mutex_;
FileSystem file_map_; // Protected by mutex_.
};
} // namespace
Env* NewMemEnv(Env* base_env) {
return new InMemoryEnv(base_env);
}
} // namespace leveldb
diff --git a/src/leveldb/include/leveldb/env.h b/src/leveldb/include/leveldb/env.h
index 99b6c2141..275d441ea 100644
--- a/src/leveldb/include/leveldb/env.h
+++ b/src/leveldb/include/leveldb/env.h
@@ -1,351 +1,360 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the leveldb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
#include <string>
#include <vector>
#include <stdarg.h>
#include <stdint.h>
#include "leveldb/status.h"
namespace leveldb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to leveldb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores NULL in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) = 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) = 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) = 0;
// Create an object that either appends to an existing file, or
// writes to a new file (if the file does not exist to begin with).
// On success, stores a pointer to the new file in *result and
// returns OK. On failure stores NULL in *result and returns
// non-OK.
//
// The returned file will only be accessed by one thread at a time.
//
// May return an IsNotSupportedError error if this Env does
// not allow appending to an existing file. Users of Env (including
// the leveldb implementation) must be prepared to deal with
// an Env that does not support appending.
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result);
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory.
virtual Status CreateDir(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores NULL in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
// Arrange to run "(*function)(arg)" once in a background thread.
//
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname, Logger** result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Sleep/delay the thread for the prescribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower that reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
+ // Get a name for the file, only for error reporting
+ virtual std::string GetName() const = 0;
+
private:
// No copying allowed
SequentialFile(const SequentialFile&);
void operator=(const SequentialFile&);
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
+ // Get a name for the file, only for error reporting
+ virtual std::string GetName() const = 0;
+
private:
// No copying allowed
RandomAccessFile(const RandomAccessFile&);
void operator=(const RandomAccessFile&);
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() { }
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0;
+ // Get a name for the file, only for error reporting
+ virtual std::string GetName() const = 0;
+
private:
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// An interface for writing log messages.
class Logger {
public:
Logger() { }
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
// Log the specified data to *info_log if info_log is non-NULL.
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f, SequentialFile** r) {
return target_->NewSequentialFile(f, r);
}
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
return target_->NewRandomAccessFile(f, r);
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
return target_->NewWritableFile(f, r);
}
Status NewAppendableFile(const std::string& f, WritableFile** r) {
return target_->NewAppendableFile(f, r);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a) {
return target_->Schedule(f, a);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
private:
Env* target_;
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/src/leveldb/table/format.cc b/src/leveldb/table/format.cc
index 24e4e0244..285e1c0de 100644
--- a/src/leveldb/table/format.cc
+++ b/src/leveldb/table/format.cc
@@ -1,144 +1,144 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/format.h"
#include "leveldb/env.h"
#include "port/port.h"
#include "table/block.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
void BlockHandle::EncodeTo(std::string* dst) const {
// Sanity check that all fields have been set
assert(offset_ != ~static_cast<uint64_t>(0));
assert(size_ != ~static_cast<uint64_t>(0));
PutVarint64(dst, offset_);
PutVarint64(dst, size_);
}
Status BlockHandle::DecodeFrom(Slice* input) {
if (GetVarint64(input, &offset_) &&
GetVarint64(input, &size_)) {
return Status::OK();
} else {
return Status::Corruption("bad block handle");
}
}
void Footer::EncodeTo(std::string* dst) const {
const size_t original_size = dst->size();
metaindex_handle_.EncodeTo(dst);
index_handle_.EncodeTo(dst);
dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu));
PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
assert(dst->size() == original_size + kEncodedLength);
(void)original_size; // Disable unused variable warning.
}
Status Footer::DecodeFrom(Slice* input) {
const char* magic_ptr = input->data() + kEncodedLength - 8;
const uint32_t magic_lo = DecodeFixed32(magic_ptr);
const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
(static_cast<uint64_t>(magic_lo)));
if (magic != kTableMagicNumber) {
return Status::Corruption("not an sstable (bad magic number)");
}
Status result = metaindex_handle_.DecodeFrom(input);
if (result.ok()) {
result = index_handle_.DecodeFrom(input);
}
if (result.ok()) {
// We skip over any leftover data (just padding for now) in "input"
const char* end = magic_ptr + 8;
*input = Slice(end, input->data() + input->size() - end);
}
return result;
}
Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,
BlockContents* result) {
result->data = Slice();
result->cachable = false;
result->heap_allocated = false;
// Read the block contents as well as the type/crc footer.
// See table_builder.cc for the code that built this structure.
size_t n = static_cast<size_t>(handle.size());
char* buf = new char[n + kBlockTrailerSize];
Slice contents;
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
if (!s.ok()) {
delete[] buf;
return s;
}
if (contents.size() != n + kBlockTrailerSize) {
delete[] buf;
- return Status::Corruption("truncated block read");
+ return Status::Corruption("truncated block read", file->GetName());
}
// Check the crc of the type and the block contents
const char* data = contents.data(); // Pointer to where Read put the data
if (options.verify_checksums) {
const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
const uint32_t actual = crc32c::Value(data, n + 1);
if (actual != crc) {
delete[] buf;
- s = Status::Corruption("block checksum mismatch");
+ s = Status::Corruption("block checksum mismatch", file->GetName());
return s;
}
}
switch (data[n]) {
case kNoCompression:
if (data != buf) {
// File implementation gave us pointer to some other data.
// Use it directly under the assumption that it will be live
// while the file is open.
delete[] buf;
result->data = Slice(data, n);
result->heap_allocated = false;
result->cachable = false; // Do not double-cache
} else {
result->data = Slice(buf, n);
result->heap_allocated = true;
result->cachable = true;
}
// Ok
break;
case kSnappyCompression: {
size_t ulength = 0;
if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
delete[] buf;
- return Status::Corruption("corrupted compressed block contents");
+ return Status::Corruption("corrupted compressed block contents", file->GetName());
}
char* ubuf = new char[ulength];
if (!port::Snappy_Uncompress(data, n, ubuf)) {
delete[] buf;
delete[] ubuf;
- return Status::Corruption("corrupted compressed block contents");
+ return Status::Corruption("corrupted compressed block contents", file->GetName());
}
delete[] buf;
result->data = Slice(ubuf, ulength);
result->heap_allocated = true;
result->cachable = true;
break;
}
default:
delete[] buf;
- return Status::Corruption("bad block type");
+ return Status::Corruption("bad block type", file->GetName());
}
return Status::OK();
}
} // namespace leveldb
diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc
index dd852af35..4676bc224 100644
--- a/src/leveldb/util/env_posix.cc
+++ b/src/leveldb/util/env_posix.cc
@@ -1,698 +1,706 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#if !defined(LEVELDB_PLATFORM_WINDOWS)
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <deque>
#include <limits>
#include <set>
#include "leveldb/env.h"
#include "leveldb/slice.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/posix_logger.h"
#include "util/env_posix_test_helper.h"
namespace leveldb {
namespace {
static int open_read_only_file_limit = -1;
static int mmap_limit = -1;
static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
// Helper class to limit resource usage to avoid exhaustion.
// Currently used to limit read-only file descriptors and mmap file usage
// so that we do not end up running out of file descriptors, virtual memory,
// or running into kernel performance problems for very large databases.
class Limiter {
public:
// Limit maximum number of resources to |n|.
Limiter(intptr_t n) {
SetAllowed(n);
}
// If another resource is available, acquire it and return true.
// Else return false.
bool Acquire() {
if (GetAllowed() <= 0) {
return false;
}
MutexLock l(&mu_);
intptr_t x = GetAllowed();
if (x <= 0) {
return false;
} else {
SetAllowed(x - 1);
return true;
}
}
// Release a resource acquired by a previous call to Acquire() that returned
// true.
void Release() {
MutexLock l(&mu_);
SetAllowed(GetAllowed() + 1);
}
private:
port::Mutex mu_;
port::AtomicPointer allowed_;
intptr_t GetAllowed() const {
return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
}
// REQUIRES: mu_ must be held
void SetAllowed(intptr_t v) {
allowed_.Release_Store(reinterpret_cast<void*>(v));
}
Limiter(const Limiter&);
void operator=(const Limiter&);
};
class PosixSequentialFile: public SequentialFile {
private:
std::string filename_;
FILE* file_;
public:
PosixSequentialFile(const std::string& fname, FILE* f)
: filename_(fname), file_(f) { }
virtual ~PosixSequentialFile() { fclose(file_); }
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s;
size_t r = fread_unlocked(scratch, 1, n, file_);
*result = Slice(scratch, r);
if (r < n) {
if (feof(file_)) {
// We leave status as ok if we hit the end of the file
} else {
// A partial read with an error: return a non-ok status
s = IOError(filename_, errno);
}
}
return s;
}
virtual Status Skip(uint64_t n) {
if (fseek(file_, n, SEEK_CUR)) {
return IOError(filename_, errno);
}
return Status::OK();
}
+
+ virtual std::string GetName() const { return filename_; }
};
// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
private:
std::string filename_;
bool temporary_fd_; // If true, fd_ is -1 and we open on every read.
int fd_;
Limiter* limiter_;
public:
PosixRandomAccessFile(const std::string& fname, int fd, Limiter* limiter)
: filename_(fname), fd_(fd), limiter_(limiter) {
temporary_fd_ = !limiter->Acquire();
if (temporary_fd_) {
// Open file on every access.
close(fd_);
fd_ = -1;
}
}
virtual ~PosixRandomAccessFile() {
if (!temporary_fd_) {
close(fd_);
limiter_->Release();
}
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
int fd = fd_;
if (temporary_fd_) {
fd = open(filename_.c_str(), O_RDONLY);
if (fd < 0) {
return IOError(filename_, errno);
}
}
Status s;
ssize_t r = pread(fd, scratch, n, static_cast<off_t>(offset));
*result = Slice(scratch, (r < 0) ? 0 : r);
if (r < 0) {
// An error: return a non-ok status
s = IOError(filename_, errno);
}
if (temporary_fd_) {
// Close the temporary file descriptor opened earlier.
close(fd);
}
return s;
}
+
+ virtual std::string GetName() const { return filename_; }
};
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
private:
std::string filename_;
void* mmapped_region_;
size_t length_;
Limiter* limiter_;
public:
// base[0,length-1] contains the mmapped contents of the file.
PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
Limiter* limiter)
: filename_(fname), mmapped_region_(base), length_(length),
limiter_(limiter) {
}
virtual ~PosixMmapReadableFile() {
munmap(mmapped_region_, length_);
limiter_->Release();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
if (offset + n > length_) {
*result = Slice();
s = IOError(filename_, EINVAL);
} else {
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
}
return s;
}
+
+ virtual std::string GetName() const { return filename_; }
};
class PosixWritableFile : public WritableFile {
private:
std::string filename_;
FILE* file_;
public:
PosixWritableFile(const std::string& fname, FILE* f)
: filename_(fname), file_(f) { }
~PosixWritableFile() {
if (file_ != NULL) {
// Ignoring any potential errors
fclose(file_);
}
}
virtual Status Append(const Slice& data) {
size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_);
if (r != data.size()) {
return IOError(filename_, errno);
}
return Status::OK();
}
virtual Status Close() {
Status result;
if (fclose(file_) != 0) {
result = IOError(filename_, errno);
}
file_ = NULL;
return result;
}
virtual Status Flush() {
if (fflush_unlocked(file_) != 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
Status SyncDirIfManifest() {
const char* f = filename_.c_str();
const char* sep = strrchr(f, '/');
Slice basename;
std::string dir;
if (sep == NULL) {
dir = ".";
basename = f;
} else {
dir = std::string(f, sep - f);
basename = sep + 1;
}
Status s;
if (basename.starts_with("MANIFEST")) {
int fd = open(dir.c_str(), O_RDONLY);
if (fd < 0) {
s = IOError(dir, errno);
} else {
if (fsync(fd) < 0 && errno != EINVAL) {
s = IOError(dir, errno);
}
close(fd);
}
}
return s;
}
virtual Status Sync() {
// Ensure new files referred to by the manifest are in the filesystem.
Status s = SyncDirIfManifest();
if (!s.ok()) {
return s;
}
if (fflush_unlocked(file_) != 0 ||
fdatasync(fileno(file_)) != 0) {
s = Status::IOError(filename_, strerror(errno));
}
return s;
}
+
+ virtual std::string GetName() const { return filename_; }
};
static int LockOrUnlock(int fd, bool lock) {
errno = 0;
struct flock f;
memset(&f, 0, sizeof(f));
f.l_type = (lock ? F_WRLCK : F_UNLCK);
f.l_whence = SEEK_SET;
f.l_start = 0;
f.l_len = 0; // Lock/unlock entire file
return fcntl(fd, F_SETLK, &f);
}
class PosixFileLock : public FileLock {
public:
int fd_;
std::string name_;
};
// Set of locked files. We keep a separate set instead of just
// relying on fcntrl(F_SETLK) since fcntl(F_SETLK) does not provide
// any protection against multiple uses from the same process.
class PosixLockTable {
private:
port::Mutex mu_;
std::set<std::string> locked_files_;
public:
bool Insert(const std::string& fname) {
MutexLock l(&mu_);
return locked_files_.insert(fname).second;
}
void Remove(const std::string& fname) {
MutexLock l(&mu_);
locked_files_.erase(fname);
}
};
class PosixEnv : public Env {
public:
PosixEnv();
virtual ~PosixEnv() {
char msg[] = "Destroying Env::Default()\n";
fwrite(msg, 1, sizeof(msg), stderr);
abort();
}
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) {
FILE* f = fopen(fname.c_str(), "r");
if (f == NULL) {
*result = NULL;
return IOError(fname, errno);
} else {
*result = new PosixSequentialFile(fname, f);
return Status::OK();
}
}
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
*result = NULL;
Status s;
int fd = open(fname.c_str(), O_RDONLY);
if (fd < 0) {
s = IOError(fname, errno);
} else if (mmap_limit_.Acquire()) {
uint64_t size;
s = GetFileSize(fname, &size);
if (s.ok()) {
void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (base != MAP_FAILED) {
*result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_);
} else {
s = IOError(fname, errno);
}
}
close(fd);
if (!s.ok()) {
mmap_limit_.Release();
}
} else {
*result = new PosixRandomAccessFile(fname, fd, &fd_limit_);
}
return s;
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
Status s;
FILE* f = fopen(fname.c_str(), "w");
if (f == NULL) {
*result = NULL;
s = IOError(fname, errno);
} else {
*result = new PosixWritableFile(fname, f);
}
return s;
}
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result) {
Status s;
FILE* f = fopen(fname.c_str(), "a");
if (f == NULL) {
*result = NULL;
s = IOError(fname, errno);
} else {
*result = new PosixWritableFile(fname, f);
}
return s;
}
virtual bool FileExists(const std::string& fname) {
return access(fname.c_str(), F_OK) == 0;
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
result->clear();
DIR* d = opendir(dir.c_str());
if (d == NULL) {
return IOError(dir, errno);
}
struct dirent* entry;
while ((entry = readdir(d)) != NULL) {
result->push_back(entry->d_name);
}
closedir(d);
return Status::OK();
}
virtual Status DeleteFile(const std::string& fname) {
Status result;
if (unlink(fname.c_str()) != 0) {
result = IOError(fname, errno);
}
return result;
}
virtual Status CreateDir(const std::string& name) {
Status result;
if (mkdir(name.c_str(), 0755) != 0) {
result = IOError(name, errno);
}
return result;
}
virtual Status DeleteDir(const std::string& name) {
Status result;
if (rmdir(name.c_str()) != 0) {
result = IOError(name, errno);
}
return result;
}
virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
Status s;
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
*size = 0;
s = IOError(fname, errno);
} else {
*size = sbuf.st_size;
}
return s;
}
virtual Status RenameFile(const std::string& src, const std::string& target) {
Status result;
if (rename(src.c_str(), target.c_str()) != 0) {
result = IOError(src, errno);
}
return result;
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = NULL;
Status result;
int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
if (fd < 0) {
result = IOError(fname, errno);
} else if (!locks_.Insert(fname)) {
close(fd);
result = Status::IOError("lock " + fname, "already held by process");
} else if (LockOrUnlock(fd, true) == -1) {
result = IOError("lock " + fname, errno);
close(fd);
locks_.Remove(fname);
} else {
PosixFileLock* my_lock = new PosixFileLock;
my_lock->fd_ = fd;
my_lock->name_ = fname;
*lock = my_lock;
}
return result;
}
virtual Status UnlockFile(FileLock* lock) {
PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
Status result;
if (LockOrUnlock(my_lock->fd_, false) == -1) {
result = IOError("unlock", errno);
}
locks_.Remove(my_lock->name_);
close(my_lock->fd_);
delete my_lock;
return result;
}
virtual void Schedule(void (*function)(void*), void* arg);
virtual void StartThread(void (*function)(void* arg), void* arg);
virtual Status GetTestDirectory(std::string* result) {
const char* env = getenv("TEST_TMPDIR");
if (env && env[0] != '\0') {
*result = env;
} else {
char buf[100];
snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid()));
*result = buf;
}
// Directory may already exist
CreateDir(*result);
return Status::OK();
}
static uint64_t gettid() {
pthread_t tid = pthread_self();
uint64_t thread_id = 0;
memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
return thread_id;
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
FILE* f = fopen(fname.c_str(), "w");
if (f == NULL) {
*result = NULL;
return IOError(fname, errno);
} else {
*result = new PosixLogger(f, &PosixEnv::gettid);
return Status::OK();
}
}
virtual uint64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
virtual void SleepForMicroseconds(int micros) {
usleep(micros);
}
private:
void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
abort();
}
}
// BGThread() is the body of the background thread
void BGThread();
static void* BGThreadWrapper(void* arg) {
reinterpret_cast<PosixEnv*>(arg)->BGThread();
return NULL;
}
pthread_mutex_t mu_;
pthread_cond_t bgsignal_;
pthread_t bgthread_;
bool started_bgthread_;
// Entry per Schedule() call
struct BGItem { void* arg; void (*function)(void*); };
typedef std::deque<BGItem> BGQueue;
BGQueue queue_;
PosixLockTable locks_;
Limiter mmap_limit_;
Limiter fd_limit_;
};
// Return the maximum number of concurrent mmaps.
static int MaxMmaps() {
if (mmap_limit >= 0) {
return mmap_limit;
}
// Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
mmap_limit = sizeof(void*) >= 8 ? 1000 : 0;
return mmap_limit;
}
// Return the maximum number of read-only files to keep open.
static intptr_t MaxOpenFiles() {
if (open_read_only_file_limit >= 0) {
return open_read_only_file_limit;
}
struct rlimit rlim;
if (getrlimit(RLIMIT_NOFILE, &rlim)) {
// getrlimit failed, fallback to hard-coded default.
open_read_only_file_limit = 50;
} else if (rlim.rlim_cur == RLIM_INFINITY) {
open_read_only_file_limit = std::numeric_limits<int>::max();
} else {
// Allow use of 20% of available file descriptors for read-only files.
open_read_only_file_limit = rlim.rlim_cur / 5;
}
return open_read_only_file_limit;
}
PosixEnv::PosixEnv()
: started_bgthread_(false),
mmap_limit_(MaxMmaps()),
fd_limit_(MaxOpenFiles()) {
PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
}
void PosixEnv::Schedule(void (*function)(void*), void* arg) {
PthreadCall("lock", pthread_mutex_lock(&mu_));
// Start background thread if necessary
if (!started_bgthread_) {
started_bgthread_ = true;
PthreadCall(
"create thread",
pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
}
// If the queue is currently empty, the background thread may currently be
// waiting.
if (queue_.empty()) {
PthreadCall("signal", pthread_cond_signal(&bgsignal_));
}
// Add to priority queue
queue_.push_back(BGItem());
queue_.back().function = function;
queue_.back().arg = arg;
PthreadCall("unlock", pthread_mutex_unlock(&mu_));
}
void PosixEnv::BGThread() {
while (true) {
// Wait until there is an item that is ready to run
PthreadCall("lock", pthread_mutex_lock(&mu_));
while (queue_.empty()) {
PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
}
void (*function)(void*) = queue_.front().function;
void* arg = queue_.front().arg;
queue_.pop_front();
PthreadCall("unlock", pthread_mutex_unlock(&mu_));
(*function)(arg);
}
}
namespace {
struct StartThreadState {
void (*user_function)(void*);
void* arg;
};
}
static void* StartThreadWrapper(void* arg) {
StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
state->user_function(state->arg);
delete state;
return NULL;
}
void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
pthread_t t;
StartThreadState* state = new StartThreadState;
state->user_function = function;
state->arg = arg;
PthreadCall("start thread",
pthread_create(&t, NULL, &StartThreadWrapper, state));
}
} // namespace
static pthread_once_t once = PTHREAD_ONCE_INIT;
static Env* default_env;
static void InitDefaultEnv() { default_env = new PosixEnv; }
void EnvPosixTestHelper::SetReadOnlyFDLimit(int limit) {
assert(default_env == NULL);
open_read_only_file_limit = limit;
}
void EnvPosixTestHelper::SetReadOnlyMMapLimit(int limit) {
assert(default_env == NULL);
mmap_limit = limit;
}
Env* Env::Default() {
pthread_once(&once, InitDefaultEnv);
return default_env;
}
} // namespace leveldb
#endif
diff --git a/src/leveldb/util/env_win.cc b/src/leveldb/util/env_win.cc
index d32c4e676..81380216b 100644
--- a/src/leveldb/util/env_win.cc
+++ b/src/leveldb/util/env_win.cc
@@ -1,898 +1,901 @@
// This file contains source that originates from:
// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/env_win32.h
// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/port_win32.cc
// Those files don't have any explicit license headers but the
// project (http://code.google.com/p/leveldbwin/) lists the 'New BSD License'
// as the license.
#if defined(LEVELDB_PLATFORM_WINDOWS)
#include <map>
#include "leveldb/env.h"
#include "port/port.h"
#include "leveldb/slice.h"
#include "util/logging.h"
#include <shlwapi.h>
#include <process.h>
#include <cstring>
#include <stdio.h>
#include <errno.h>
#include <io.h>
#include <algorithm>
#ifdef max
#undef max
#endif
#ifndef va_copy
#define va_copy(d,s) ((d) = (s))
#endif
#if defined DeleteFile
#undef DeleteFile
#endif
//Declarations
namespace leveldb
{
namespace Win32
{
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
std::string GetCurrentDir();
std::wstring GetCurrentDirW();
static const std::string CurrentDir = GetCurrentDir();
static const std::wstring CurrentDirW = GetCurrentDirW();
std::string& ModifyPath(std::string& path);
std::wstring& ModifyPath(std::wstring& path);
std::string GetLastErrSz();
std::wstring GetLastErrSzW();
size_t GetPageSize();
typedef void (*ScheduleProc)(void*) ;
struct WorkItemWrapper
{
WorkItemWrapper(ScheduleProc proc_,void* content_);
ScheduleProc proc;
void* pContent;
};
DWORD WINAPI WorkItemWrapperProc(LPVOID pContent);
class Win32SequentialFile : public SequentialFile
{
public:
friend class Win32Env;
virtual ~Win32SequentialFile();
virtual Status Read(size_t n, Slice* result, char* scratch);
virtual Status Skip(uint64_t n);
BOOL isEnable();
+ virtual std::string GetName() const { return _filename; }
private:
BOOL _Init();
void _CleanUp();
Win32SequentialFile(const std::string& fname);
std::string _filename;
::HANDLE _hFile;
DISALLOW_COPY_AND_ASSIGN(Win32SequentialFile);
};
class Win32RandomAccessFile : public RandomAccessFile
{
public:
friend class Win32Env;
virtual ~Win32RandomAccessFile();
virtual Status Read(uint64_t offset, size_t n, Slice* result,char* scratch) const;
BOOL isEnable();
+ virtual std::string GetName() const { return _filename; }
private:
BOOL _Init(LPCWSTR path);
void _CleanUp();
Win32RandomAccessFile(const std::string& fname);
HANDLE _hFile;
const std::string _filename;
DISALLOW_COPY_AND_ASSIGN(Win32RandomAccessFile);
};
class Win32WritableFile : public WritableFile
{
public:
Win32WritableFile(const std::string& fname, bool append);
~Win32WritableFile();
virtual Status Append(const Slice& data);
virtual Status Close();
virtual Status Flush();
virtual Status Sync();
BOOL isEnable();
+ virtual std::string GetName() const { return filename_; }
private:
std::string filename_;
::HANDLE _hFile;
};
class Win32FileLock : public FileLock
{
public:
friend class Win32Env;
virtual ~Win32FileLock();
BOOL isEnable();
private:
BOOL _Init(LPCWSTR path);
void _CleanUp();
Win32FileLock(const std::string& fname);
HANDLE _hFile;
std::string _filename;
DISALLOW_COPY_AND_ASSIGN(Win32FileLock);
};
class Win32Logger : public Logger
{
public:
friend class Win32Env;
virtual ~Win32Logger();
virtual void Logv(const char* format, va_list ap);
private:
explicit Win32Logger(WritableFile* pFile);
WritableFile* _pFileProxy;
DISALLOW_COPY_AND_ASSIGN(Win32Logger);
};
class Win32Env : public Env
{
public:
Win32Env();
virtual ~Win32Env();
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result);
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result);
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result);
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result);
virtual bool FileExists(const std::string& fname);
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result);
virtual Status DeleteFile(const std::string& fname);
virtual Status CreateDir(const std::string& dirname);
virtual Status DeleteDir(const std::string& dirname);
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size);
virtual Status RenameFile(const std::string& src,
const std::string& target);
virtual Status LockFile(const std::string& fname, FileLock** lock);
virtual Status UnlockFile(FileLock* lock);
virtual void Schedule(
void (*function)(void* arg),
void* arg);
virtual void StartThread(void (*function)(void* arg), void* arg);
virtual Status GetTestDirectory(std::string* path);
//virtual void Logv(WritableFile* log, const char* format, va_list ap);
virtual Status NewLogger(const std::string& fname, Logger** result);
virtual uint64_t NowMicros();
virtual void SleepForMicroseconds(int micros);
};
void ToWidePath(const std::string& value, std::wstring& target) {
wchar_t buffer[MAX_PATH];
MultiByteToWideChar(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH);
target = buffer;
}
void ToNarrowPath(const std::wstring& value, std::string& target) {
char buffer[MAX_PATH];
WideCharToMultiByte(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH, NULL, NULL);
target = buffer;
}
std::string GetCurrentDir()
{
CHAR path[MAX_PATH];
::GetModuleFileNameA(::GetModuleHandleA(NULL),path,MAX_PATH);
*strrchr(path,'\\') = 0;
return std::string(path);
}
std::wstring GetCurrentDirW()
{
WCHAR path[MAX_PATH];
::GetModuleFileNameW(::GetModuleHandleW(NULL),path,MAX_PATH);
*wcsrchr(path,L'\\') = 0;
return std::wstring(path);
}
std::string& ModifyPath(std::string& path)
{
if(path[0] == '/' || path[0] == '\\'){
path = CurrentDir + path;
}
std::replace(path.begin(),path.end(),'/','\\');
return path;
}
std::wstring& ModifyPath(std::wstring& path)
{
if(path[0] == L'/' || path[0] == L'\\'){
path = CurrentDirW + path;
}
std::replace(path.begin(),path.end(),L'/',L'\\');
return path;
}
std::string GetLastErrSz()
{
LPWSTR lpMsgBuf;
FormatMessageW(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
GetLastError(),
0, // Default language
(LPWSTR) &lpMsgBuf,
0,
NULL
);
std::string Err;
ToNarrowPath(lpMsgBuf, Err);
LocalFree( lpMsgBuf );
return Err;
}
std::wstring GetLastErrSzW()
{
LPVOID lpMsgBuf;
FormatMessageW(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
GetLastError(),
0, // Default language
(LPWSTR) &lpMsgBuf,
0,
NULL
);
std::wstring Err = (LPCWSTR)lpMsgBuf;
LocalFree(lpMsgBuf);
return Err;
}
WorkItemWrapper::WorkItemWrapper( ScheduleProc proc_,void* content_ ) :
proc(proc_),pContent(content_)
{
}
DWORD WINAPI WorkItemWrapperProc(LPVOID pContent)
{
WorkItemWrapper* item = static_cast<WorkItemWrapper*>(pContent);
ScheduleProc TempProc = item->proc;
void* arg = item->pContent;
delete item;
TempProc(arg);
return 0;
}
size_t GetPageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si);
return std::max(si.dwPageSize,si.dwAllocationGranularity);
}
const size_t g_PageSize = GetPageSize();
Win32SequentialFile::Win32SequentialFile( const std::string& fname ) :
_filename(fname),_hFile(NULL)
{
_Init();
}
Win32SequentialFile::~Win32SequentialFile()
{
_CleanUp();
}
Status Win32SequentialFile::Read( size_t n, Slice* result, char* scratch )
{
Status sRet;
DWORD hasRead = 0;
if(_hFile && ReadFile(_hFile,scratch,n,&hasRead,NULL) ){
*result = Slice(scratch,hasRead);
} else {
sRet = Status::IOError(_filename, Win32::GetLastErrSz() );
}
return sRet;
}
Status Win32SequentialFile::Skip( uint64_t n )
{
Status sRet;
LARGE_INTEGER Move,NowPointer;
Move.QuadPart = n;
if(!SetFilePointerEx(_hFile,Move,&NowPointer,FILE_CURRENT)){
sRet = Status::IOError(_filename,Win32::GetLastErrSz());
}
return sRet;
}
BOOL Win32SequentialFile::isEnable()
{
return _hFile ? TRUE : FALSE;
}
BOOL Win32SequentialFile::_Init()
{
std::wstring path;
ToWidePath(_filename, path);
_hFile = CreateFileW(path.c_str(),
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
NULL);
if (_hFile == INVALID_HANDLE_VALUE)
_hFile = NULL;
return _hFile ? TRUE : FALSE;
}
void Win32SequentialFile::_CleanUp()
{
if(_hFile){
CloseHandle(_hFile);
_hFile = NULL;
}
}
Win32RandomAccessFile::Win32RandomAccessFile( const std::string& fname ) :
_filename(fname),_hFile(NULL)
{
std::wstring path;
ToWidePath(fname, path);
_Init( path.c_str() );
}
Win32RandomAccessFile::~Win32RandomAccessFile()
{
_CleanUp();
}
Status Win32RandomAccessFile::Read(uint64_t offset,size_t n,Slice* result,char* scratch) const
{
Status sRet;
OVERLAPPED ol = {0};
ZeroMemory(&ol,sizeof(ol));
ol.Offset = (DWORD)offset;
ol.OffsetHigh = (DWORD)(offset >> 32);
DWORD hasRead = 0;
if(!ReadFile(_hFile,scratch,n,&hasRead,&ol))
sRet = Status::IOError(_filename,Win32::GetLastErrSz());
else
*result = Slice(scratch,hasRead);
return sRet;
}
BOOL Win32RandomAccessFile::_Init( LPCWSTR path )
{
BOOL bRet = FALSE;
if(!_hFile)
_hFile = ::CreateFileW(path,GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS,NULL);
if(!_hFile || _hFile == INVALID_HANDLE_VALUE )
_hFile = NULL;
else
bRet = TRUE;
return bRet;
}
BOOL Win32RandomAccessFile::isEnable()
{
return _hFile ? TRUE : FALSE;
}
void Win32RandomAccessFile::_CleanUp()
{
if(_hFile){
::CloseHandle(_hFile);
_hFile = NULL;
}
}
Win32WritableFile::Win32WritableFile(const std::string& fname, bool append)
: filename_(fname)
{
std::wstring path;
ToWidePath(fname, path);
// NewAppendableFile: append to an existing file, or create a new one
// if none exists - this is OPEN_ALWAYS behavior, with
// FILE_APPEND_DATA to avoid having to manually position the file
// pointer at the end of the file.
// NewWritableFile: create a new file, delete if it exists - this is
// CREATE_ALWAYS behavior. This file is used for writing only so
// use GENERIC_WRITE.
_hFile = CreateFileW(path.c_str(),
append ? FILE_APPEND_DATA : GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE,
NULL,
append ? OPEN_ALWAYS : CREATE_ALWAYS,
FILE_ATTRIBUTE_NORMAL,
NULL);
// CreateFileW returns INVALID_HANDLE_VALUE in case of error, always check isEnable() before use
}
Win32WritableFile::~Win32WritableFile()
{
if (_hFile != INVALID_HANDLE_VALUE)
Close();
}
Status Win32WritableFile::Append(const Slice& data)
{
DWORD r = 0;
if (!WriteFile(_hFile, data.data(), data.size(), &r, NULL) || r != data.size()) {
return Status::IOError("Win32WritableFile.Append::WriteFile: "+filename_, Win32::GetLastErrSz());
}
return Status::OK();
}
Status Win32WritableFile::Close()
{
if (!CloseHandle(_hFile)) {
return Status::IOError("Win32WritableFile.Close::CloseHandle: "+filename_, Win32::GetLastErrSz());
}
_hFile = INVALID_HANDLE_VALUE;
return Status::OK();
}
Status Win32WritableFile::Flush()
{
// Nothing to do here, there are no application-side buffers
return Status::OK();
}
Status Win32WritableFile::Sync()
{
if (!FlushFileBuffers(_hFile)) {
return Status::IOError("Win32WritableFile.Sync::FlushFileBuffers "+filename_, Win32::GetLastErrSz());
}
return Status::OK();
}
BOOL Win32WritableFile::isEnable()
{
return _hFile != INVALID_HANDLE_VALUE;
}
Win32FileLock::Win32FileLock( const std::string& fname ) :
_hFile(NULL),_filename(fname)
{
std::wstring path;
ToWidePath(fname, path);
_Init(path.c_str());
}
Win32FileLock::~Win32FileLock()
{
_CleanUp();
}
BOOL Win32FileLock::_Init( LPCWSTR path )
{
BOOL bRet = FALSE;
if(!_hFile)
_hFile = ::CreateFileW(path,0,0,NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL);
if(!_hFile || _hFile == INVALID_HANDLE_VALUE ){
_hFile = NULL;
}
else
bRet = TRUE;
return bRet;
}
void Win32FileLock::_CleanUp()
{
::CloseHandle(_hFile);
_hFile = NULL;
}
BOOL Win32FileLock::isEnable()
{
return _hFile ? TRUE : FALSE;
}
Win32Logger::Win32Logger(WritableFile* pFile) : _pFileProxy(pFile)
{
assert(_pFileProxy);
}
Win32Logger::~Win32Logger()
{
if(_pFileProxy)
delete _pFileProxy;
}
void Win32Logger::Logv( const char* format, va_list ap )
{
uint64_t thread_id = ::GetCurrentThreadId();
// We try twice: the first time with a fixed-size stack allocated buffer,
// and the second time with a much larger dynamically allocated buffer.
char buffer[500];
for (int iter = 0; iter < 2; iter++) {
char* base;
int bufsize;
if (iter == 0) {
bufsize = sizeof(buffer);
base = buffer;
} else {
bufsize = 30000;
base = new char[bufsize];
}
char* p = base;
char* limit = base + bufsize;
SYSTEMTIME st;
GetLocalTime(&st);
p += snprintf(p, limit - p,
"%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
int(st.wYear),
int(st.wMonth),
int(st.wDay),
int(st.wHour),
int(st.wMinute),
int(st.wMinute),
int(st.wMilliseconds),
static_cast<long long unsigned int>(thread_id));
// Print the message
if (p < limit) {
va_list backup_ap;
va_copy(backup_ap, ap);
p += vsnprintf(p, limit - p, format, backup_ap);
va_end(backup_ap);
}
// Truncate to available space if necessary
if (p >= limit) {
if (iter == 0) {
continue; // Try again with larger buffer
} else {
p = limit - 1;
}
}
// Add newline if necessary
if (p == base || p[-1] != '\n') {
*p++ = '\n';
}
assert(p <= limit);
DWORD hasWritten = 0;
if(_pFileProxy){
_pFileProxy->Append(Slice(base, p - base));
_pFileProxy->Flush();
}
if (base != buffer) {
delete[] base;
}
break;
}
}
bool Win32Env::FileExists(const std::string& fname)
{
std::string path = fname;
std::wstring wpath;
ToWidePath(ModifyPath(path), wpath);
return ::PathFileExistsW(wpath.c_str()) ? true : false;
}
Status Win32Env::GetChildren(const std::string& dir, std::vector<std::string>* result)
{
Status sRet;
::WIN32_FIND_DATAW wfd;
std::string path = dir;
ModifyPath(path);
path += "\\*.*";
std::wstring wpath;
ToWidePath(path, wpath);
::HANDLE hFind = ::FindFirstFileW(wpath.c_str() ,&wfd);
if(hFind && hFind != INVALID_HANDLE_VALUE){
BOOL hasNext = TRUE;
std::string child;
while(hasNext){
ToNarrowPath(wfd.cFileName, child);
if(child != ".." && child != ".") {
result->push_back(child);
}
hasNext = ::FindNextFileW(hFind,&wfd);
}
::FindClose(hFind);
}
else
sRet = Status::IOError(dir,"Could not get children.");
return sRet;
}
void Win32Env::SleepForMicroseconds( int micros )
{
::Sleep((micros + 999) /1000);
}
Status Win32Env::DeleteFile( const std::string& fname )
{
Status sRet;
std::string path = fname;
std::wstring wpath;
ToWidePath(ModifyPath(path), wpath);
if(!::DeleteFileW(wpath.c_str())) {
sRet = Status::IOError(path, "Could not delete file.");
}
return sRet;
}
Status Win32Env::GetFileSize( const std::string& fname, uint64_t* file_size )
{
Status sRet;
std::string path = fname;
std::wstring wpath;
ToWidePath(ModifyPath(path), wpath);
HANDLE file = ::CreateFileW(wpath.c_str(),
GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);
LARGE_INTEGER li;
if(::GetFileSizeEx(file,&li)){
*file_size = (uint64_t)li.QuadPart;
}else
sRet = Status::IOError(path,"Could not get the file size.");
CloseHandle(file);
return sRet;
}
Status Win32Env::RenameFile( const std::string& src, const std::string& target )
{
Status sRet;
std::string src_path = src;
std::wstring wsrc_path;
ToWidePath(ModifyPath(src_path), wsrc_path);
std::string target_path = target;
std::wstring wtarget_path;
ToWidePath(ModifyPath(target_path), wtarget_path);
if(!MoveFileW(wsrc_path.c_str(), wtarget_path.c_str() ) ){
DWORD err = GetLastError();
if(err == 0x000000b7){
if(!::DeleteFileW(wtarget_path.c_str() ) )
sRet = Status::IOError(src, "Could not rename file.");
else if(!::MoveFileW(wsrc_path.c_str(),
wtarget_path.c_str() ) )
sRet = Status::IOError(src, "Could not rename file.");
}
}
return sRet;
}
Status Win32Env::LockFile( const std::string& fname, FileLock** lock )
{
Status sRet;
std::string path = fname;
ModifyPath(path);
Win32FileLock* _lock = new Win32FileLock(path);
if(!_lock->isEnable()){
delete _lock;
*lock = NULL;
sRet = Status::IOError(path, "Could not lock file.");
}
else
*lock = _lock;
return sRet;
}
Status Win32Env::UnlockFile( FileLock* lock )
{
Status sRet;
delete lock;
return sRet;
}
void Win32Env::Schedule( void (*function)(void* arg), void* arg )
{
QueueUserWorkItem(Win32::WorkItemWrapperProc,
new Win32::WorkItemWrapper(function,arg),
WT_EXECUTEDEFAULT);
}
void Win32Env::StartThread( void (*function)(void* arg), void* arg )
{
::_beginthread(function,0,arg);
}
Status Win32Env::GetTestDirectory( std::string* path )
{
Status sRet;
WCHAR TempPath[MAX_PATH];
::GetTempPathW(MAX_PATH,TempPath);
ToNarrowPath(TempPath, *path);
path->append("leveldb\\test\\");
ModifyPath(*path);
return sRet;
}
uint64_t Win32Env::NowMicros()
{
#ifndef USE_VISTA_API
#define GetTickCount64 GetTickCount
#endif
return (uint64_t)(GetTickCount64()*1000);
}
static Status CreateDirInner( const std::string& dirname )
{
Status sRet;
DWORD attr = ::GetFileAttributes(dirname.c_str());
if (attr == INVALID_FILE_ATTRIBUTES) { // doesn't exist:
std::size_t slash = dirname.find_last_of("\\");
if (slash != std::string::npos){
sRet = CreateDirInner(dirname.substr(0, slash));
if (!sRet.ok()) return sRet;
}
BOOL result = ::CreateDirectory(dirname.c_str(), NULL);
if (result == FALSE) {
sRet = Status::IOError(dirname, "Could not create directory.");
return sRet;
}
}
return sRet;
}
Status Win32Env::CreateDir( const std::string& dirname )
{
std::string path = dirname;
if(path[path.length() - 1] != '\\'){
path += '\\';
}
ModifyPath(path);
return CreateDirInner(path);
}
Status Win32Env::DeleteDir( const std::string& dirname )
{
Status sRet;
std::wstring path;
ToWidePath(dirname, path);
ModifyPath(path);
if(!::RemoveDirectoryW( path.c_str() ) ){
sRet = Status::IOError(dirname, "Could not delete directory.");
}
return sRet;
}
Status Win32Env::NewSequentialFile( const std::string& fname, SequentialFile** result )
{
Status sRet;
std::string path = fname;
ModifyPath(path);
Win32SequentialFile* pFile = new Win32SequentialFile(path);
if(pFile->isEnable()){
*result = pFile;
}else {
delete pFile;
sRet = Status::IOError(path, Win32::GetLastErrSz());
}
return sRet;
}
Status Win32Env::NewRandomAccessFile( const std::string& fname, RandomAccessFile** result )
{
Status sRet;
std::string path = fname;
Win32RandomAccessFile* pFile = new Win32RandomAccessFile(ModifyPath(path));
if(!pFile->isEnable()){
delete pFile;
*result = NULL;
sRet = Status::IOError(path, Win32::GetLastErrSz());
}else
*result = pFile;
return sRet;
}
Status Win32Env::NewLogger( const std::string& fname, Logger** result )
{
Status sRet;
std::string path = fname;
// Logs are opened with write semantics, not with append semantics
// (see PosixEnv::NewLogger)
Win32WritableFile* pMapFile = new Win32WritableFile(ModifyPath(path), false);
if(!pMapFile->isEnable()){
delete pMapFile;
*result = NULL;
sRet = Status::IOError(path,"could not create a logger.");
}else
*result = new Win32Logger(pMapFile);
return sRet;
}
Status Win32Env::NewWritableFile( const std::string& fname, WritableFile** result )
{
Status sRet;
std::string path = fname;
Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), false);
if(!pFile->isEnable()){
*result = NULL;
sRet = Status::IOError(fname,Win32::GetLastErrSz());
}else
*result = pFile;
return sRet;
}
Status Win32Env::NewAppendableFile( const std::string& fname, WritableFile** result )
{
Status sRet;
std::string path = fname;
Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), true);
if(!pFile->isEnable()){
*result = NULL;
sRet = Status::IOError(fname,Win32::GetLastErrSz());
}else
*result = pFile;
return sRet;
}
Win32Env::Win32Env()
{
}
Win32Env::~Win32Env()
{
}
} // Win32 namespace
static port::OnceType once = LEVELDB_ONCE_INIT;
static Env* default_env;
static void InitDefaultEnv() { default_env = new Win32::Win32Env(); }
Env* Env::Default() {
port::InitOnce(&once, InitDefaultEnv);
return default_env;
}
} // namespace leveldb
#endif // defined(LEVELDB_PLATFORM_WINDOWS)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Wed, Jan 29, 17:11 (11 h, 18 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5053391
Default Alt Text
(144 KB)
Attached To
rABC Bitcoin ABC
Event Timeline
Log In to Comment