Page MenuHomePhabricator

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile
old mode 100755
new mode 100644
index 14e494f3c..fdc66f9a2
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -1,200 +1,199 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
-# Inherit some settings from environment variables, if available
-INSTALL_PATH ?= $(CURDIR)
-
#-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.
OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode)
# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
#-----------------------------------------------
# detect what platform we're building on
-$(shell ./build_detect_platform build_config.mk)
+$(shell CC=$(CC) CXX=$(CXX) TARGET_OS=$(TARGET_OS) \
+ ./build_detect_platform build_config.mk ./)
# this file is generated by the previous line to set build flags and sources
include build_config.mk
-xCFLAGS = -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) $(CFLAGS)
-xCXXFLAGS = -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) $(CXXFLAGS)
+CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
-xLDFLAGS = $(PLATFORM_LDFLAGS) $(LDFLAGS)
+LDFLAGS += $(PLATFORM_LDFLAGS)
+LIBS += $(PLATFORM_LIBS)
LIBOBJECTS = $(SOURCES:.cc=.o)
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
TESTS = \
arena_test \
bloom_test \
c_test \
cache_test \
coding_test \
corruption_test \
crc32c_test \
db_test \
dbformat_test \
env_test \
filename_test \
filter_block_test \
log_test \
memenv_test \
skiplist_test \
table_test \
version_edit_test \
version_set_test \
write_batch_test
PROGRAMS = db_bench $(TESTS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
LIBRARY = libleveldb.a
MEMENVLIBRARY = libmemenv.a
default: all
# Should we build shared libraries?
ifneq ($(PLATFORM_SHARED_EXT),)
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 1
-SHARED_MINOR = 5
+SHARED_MINOR = 7
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2)
endif
$(SHARED3):
- $(CXX) $(xLDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(xCXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(PLATFORM_EXTRALIBS) -o $(SHARED3)
+ $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) $(LIBS)
endif # PLATFORM_SHARED_EXT
all: $(SHARED) $(LIBRARY)
check: all $(PROGRAMS) $(TESTS)
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
clean:
-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o build_config.mk
-rm -rf ios-x86/* ios-arm/*
$(LIBRARY): $(LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(LIBOBJECTS)
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
- $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS)
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
- $(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS) -lsqlite3
+ $(CXX) $(LDFLAGS) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS)
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
- $(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS) -lkyotocabinet
+ $(CXX) $(LDFLAGS) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
- $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(MEMENVOBJECTS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
- $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(xLDFLAGS) $(PLATFORM_EXTRALIBS)
+ $(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LIBS)
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
.cc.o:
mkdir -p ios-x86/$(dir $@)
- $(SIMULATORROOT)/usr/bin/$(CXX) $(xCXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+ $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
- $(DEVICEROOT)/usr/bin/$(CXX) $(xCXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+ $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o:
mkdir -p ios-x86/$(dir $@)
- $(SIMULATORROOT)/usr/bin/$(CC) $(xCFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+ $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
- $(DEVICEROOT)/usr/bin/$(CC) $(xCFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+ $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
else
.cc.o:
- $(CXX) $(xCXXFLAGS) -c $< -o $@
+ $(CXX) $(CXXFLAGS) -c $< -o $@
.c.o:
- $(CC) $(xCFLAGS) -c $< -o $@
+ $(CC) $(CFLAGS) -c $< -o $@
endif
diff --git a/src/leveldb/NEWS b/src/leveldb/NEWS
new file mode 100644
index 000000000..3fd99242d
--- /dev/null
+++ b/src/leveldb/NEWS
@@ -0,0 +1,17 @@
+Release 1.2 2011-05-16
+----------------------
+
+Fixes for larger databases (tested up to one billion 100-byte entries,
+i.e., ~100GB).
+
+(1) Place hard limit on number of level-0 files. This fixes errors
+of the form "too many open files".
+
+(2) Fixed memtable management. Before the fix, a heavy write burst
+could cause unbounded memory usage.
+
+A fix for a logging bug where the reader would incorrectly complain
+about corruption.
+
+Allow public access to WriteBatch contents so that users can easily
+wrap a DB.
diff --git a/src/leveldb/README b/src/leveldb/README
index 2bf787ef2..3618adeee 100644
--- a/src/leveldb/README
+++ b/src/leveldb/README
@@ -1,59 +1,51 @@
-LevelDB is a third party library used for the transaction database.
-It is imported into the Bitcoin codebase due to being relatively new
-and not widely packaged.
-
-
-
----------------------------------------------------------------------
-
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
The code under this directory implements a system for maintaining a
persistent key/value store.
See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
include/db.h
Main interface to the DB: Start here
include/options.h
Control over the behavior of an entire database, and also
control over the behavior of individual reads and writes.
include/comparator.h
Abstraction for user-specified comparison function. If you want
just bytewise comparison of keys, you can use the default comparator,
but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character
encodings, etc.)
include/iterator.h
Interface for iterating over data. You can get an iterator
from a DB object.
include/write_batch.h
Interface for atomically applying multiple updates to a database.
include/slice.h
A simple module for maintaining a pointer and a length into some
other byte array.
include/status.h
Status is returned from many of the public interfaces and is used
to report success and various kinds of errors.
include/env.h
Abstraction of the OS environment. A posix implementation of
this interface is in util/env_posix.cc
include/table.h
include/table_builder.h
Lower-level modules that most clients probably won't use directly
diff --git a/src/leveldb/TODO b/src/leveldb/TODO
index 9130b6a9f..e603c0713 100644
--- a/src/leveldb/TODO
+++ b/src/leveldb/TODO
@@ -1,13 +1,14 @@
ss
- Stats
db
- Maybe implement DB::BulkDeleteForRange(start_key, end_key)
that would blow away files whose ranges are entirely contained
within [start_key..end_key]? For Chrome, deletion of obsolete
object stores, etc. can be done in the background anyway, so
probably not that important.
+- There have been requests for MultiGet.
After a range is completely deleted, what gets rid of the
corresponding files if we do no future changes to that range. Make
the conditions for triggering compactions fire in more situations?
diff --git a/src/leveldb/build_detect_platform b/src/leveldb/build_detect_platform
index 566d1f83a..7cdb793a1 100755
--- a/src/leveldb/build_detect_platform
+++ b/src/leveldb/build_detect_platform
@@ -1,191 +1,202 @@
#!/bin/sh
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
# CC C Compiler path
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
+# PLATFORM_LIBS Libraries flags
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
+# This flag is embedded just before the name
+# of the shared library without intervening spaces
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
# shared libraries, empty otherwise.
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
# -DSNAPPY if the Snappy library is present
#
OUTPUT=$1
-if test -z "$OUTPUT"; then
- echo "usage: $0 <output-filename>" >&2
+PREFIX=$2
+if test -z "$OUTPUT" || test -z "$PREFIX"; then
+ echo "usage: $0 <output-filename> <directory_prefix>" >&2
exit 1
fi
# Delete existing output, if it exists
rm -f $OUTPUT
touch $OUTPUT
if test -z "$CC"; then
CC=cc
fi
if test -z "$CXX"; then
CXX=g++
fi
# Detect OS
if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
COMMON_FLAGS=
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS=
PLATFORM_LDFLAGS=
-PLATFORM_EXTRALIBS=
-PLATFORM_SOURCES=
+PLATFORM_LIBS=
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
-# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+MEMCMP_FLAG=
+if [ "$CXX" = "g++" ]; then
+ # Use libc's memcmp instead of GCC's memcmp. This results in ~40%
+ # performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
+ MEMCMP_FLAG="-fno-builtin-memcmp"
+fi
+
case "$TARGET_OS" in
Darwin)
PLATFORM=OS_MACOSX
- COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
+ COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
PLATFORM_SHARED_EXT=dylib
- PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
+ PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
+ PORT_FILE=port/port_posix.cc
;;
Linux)
PLATFORM=OS_LINUX
- COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
+ COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
PLATFORM_LDFLAGS="-pthread"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ PORT_FILE=port/port_posix.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
- PLATFORM_LDFLAGS="-lpthread -lrt"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
+ PLATFORM_LIBS="-lpthread -lrt"
+ PORT_FILE=port/port_posix.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
- PLATFORM_LDFLAGS="-lpthread"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
+ PLATFORM_LIBS="-lpthread"
+ PORT_FILE=port/port_posix.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
- PLATFORM_LDFLAGS="-lpthread -lgcc_s"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
+ PLATFORM_LIBS="-lpthread -lgcc_s"
+ PORT_FILE=port/port_posix.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="-pthread"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ PORT_FILE=port/port_posix.cc
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
- PLATFORM_LDFLAGS="-lpthread"
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
+ PLATFORM_LIBS="-lpthread"
+ PORT_FILE=port/port_posix.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
- PLATFORM_SOURCES="port/port_posix.cc util/env_posix.cc"
+ PORT_FILE=port/port_posix.cc
CROSS_COMPILE=true
;;
- OS_WINDOWS_CROSSCOMPILE)
- PLATFORM=OS_WINDOWS
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DBOOST_THREAD_USE_LIB"
- PLATFORM_CXXFLAGS=""
- PLATFORM_LDFLAGS=""
- PLATFORM_SHARED_CFLAGS=""
- PLATFORM_SOURCES="port/port_win.cc util/env_boost.cc util/win_logger.cc"
- PLATFORM_EXTRALIBS="-lboost_system-mt-s -lboost_filesystem-mt-s -lboost_thread_win32-mt-s"
- CROSS_COMPILE=true
+ HP-UX)
+ PLATFORM=OS_HPUX
+ COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
+ PLATFORM_LDFLAGS="-pthread"
+ PORT_FILE=port/port_posix.cc
+ # man ld: +h internal_name
+ PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
;;
- NATIVE_WINDOWS)
- PLATFORM=OS_WINDOWS
- COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DBOOST_THREAD_USE_LIB"
- PLATFORM_CXXFLAGS=""
- PLATFORM_LDFLAGS=""
- PLATFORM_SHARED_CFLAGS=""
- PLATFORM_SOURCES="port/port_win.cc util/env_boost.cc util/win_logger.cc"
- PLATFORM_EXTRALIBS="-lboost_system-mgw45-mt-s-1_50 -lboost_filesystem-mgw45-mt-s-1_50 -lboost_thread-mgw45-mt-s-1_50 -lboost_chrono-mgw45-mt-s-1_50"
- CROSS_COMPILE=true
- ;;
*)
echo "Unknown platform!" >&2
exit 1
esac
# We want to make a list of all cc files within util, db, table, and helpers
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
-DIRS="util db table"
+DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"
+
set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune"
-PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -not -name 'env_*.cc' -not -name '*_logger.cc' -print | sort | tr "\n" " "`
+PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`
+
set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port
# file.
-echo "SOURCES=$PORTABLE_FILES $PLATFORM_SOURCES" >> $OUTPUT
+echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
if [ "$CROSS_COMPILE" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
true
else
# If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
- $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
+ $CXX $CXXFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <cstdatomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
PLATFORM_CXXFLAGS="-std=c++0x"
else
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
fi
+ # Test whether Snappy library is installed
+ # http://code.google.com/p/snappy/
+ $CXX $CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
+ #include <snappy.h>
+ int main() {}
+EOF
+ if [ "$?" = 0 ]; then
+ COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+ PLATFORM_LIBS="$PLATFORM_LIBS -lsnappy"
+ fi
+
# Test whether tcmalloc is available
- $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
+ $CXX $CXXFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
- PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+ PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
fi
fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
+echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
-echo "PLATFORM_EXTRALIBS=$PLATFORM_EXTRALIBS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc
index 2dde400e7..08ff0ad90 100644
--- a/src/leveldb/db/c.cc
+++ b/src/leveldb/db/c.cc
@@ -1,581 +1,595 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/c.h"
#include <stdlib.h>
#include <unistd.h>
#include "leveldb/cache.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/filter_policy.h"
#include "leveldb/iterator.h"
#include "leveldb/options.h"
#include "leveldb/status.h"
#include "leveldb/write_batch.h"
using leveldb::Cache;
using leveldb::Comparator;
using leveldb::CompressionType;
using leveldb::DB;
using leveldb::Env;
using leveldb::FileLock;
using leveldb::FilterPolicy;
using leveldb::Iterator;
+using leveldb::kMajorVersion;
+using leveldb::kMinorVersion;
using leveldb::Logger;
using leveldb::NewBloomFilterPolicy;
using leveldb::NewLRUCache;
using leveldb::Options;
using leveldb::RandomAccessFile;
using leveldb::Range;
using leveldb::ReadOptions;
using leveldb::SequentialFile;
using leveldb::Slice;
using leveldb::Snapshot;
using leveldb::Status;
using leveldb::WritableFile;
using leveldb::WriteBatch;
using leveldb::WriteOptions;
extern "C" {
struct leveldb_t { DB* rep; };
struct leveldb_iterator_t { Iterator* rep; };
struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_cache_t { Cache* rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };
struct leveldb_randomfile_t { RandomAccessFile* rep; };
struct leveldb_writablefile_t { WritableFile* rep; };
struct leveldb_logger_t { Logger* rep; };
struct leveldb_filelock_t { FileLock* rep; };
struct leveldb_comparator_t : public Comparator {
void* state_;
void (*destructor_)(void*);
int (*compare_)(
void*,
const char* a, size_t alen,
const char* b, size_t blen);
const char* (*name_)(void*);
virtual ~leveldb_comparator_t() {
(*destructor_)(state_);
}
virtual int Compare(const Slice& a, const Slice& b) const {
return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
}
virtual const char* Name() const {
return (*name_)(state_);
}
// No-ops since the C binding does not support key shortening methods.
virtual void FindShortestSeparator(std::string*, const Slice&) const { }
virtual void FindShortSuccessor(std::string* key) const { }
};
struct leveldb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*create_)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length);
unsigned char (*key_match_)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length);
virtual ~leveldb_filterpolicy_t() {
(*destructor_)(state_);
}
virtual const char* Name() const {
return (*name_)(state_);
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
std::vector<const char*> key_pointers(n);
std::vector<size_t> key_sizes(n);
for (int i = 0; i < n; i++) {
key_pointers[i] = keys[i].data();
key_sizes[i] = keys[i].size();
}
size_t len;
char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
dst->append(filter, len);
free(filter);
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return (*key_match_)(state_, key.data(), key.size(),
filter.data(), filter.size());
}
};
struct leveldb_env_t {
Env* rep;
bool is_default;
};
static bool SaveError(char** errptr, const Status& s) {
assert(errptr != NULL);
if (s.ok()) {
return false;
} else if (*errptr == NULL) {
*errptr = strdup(s.ToString().c_str());
} else {
// TODO(sanjay): Merge with existing error?
free(*errptr);
*errptr = strdup(s.ToString().c_str());
}
return true;
}
static char* CopyString(const std::string& str) {
char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
memcpy(result, str.data(), sizeof(char) * str.size());
return result;
}
leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
return NULL;
}
leveldb_t* result = new leveldb_t;
result->rep = db;
return result;
}
void leveldb_close(leveldb_t* db) {
delete db->rep;
delete db;
}
void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
}
void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
}
void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = NULL;
std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options) {
leveldb_iterator_t* result = new leveldb_iterator_t;
result->rep = db->rep->NewIterator(options->rep);
return result;
}
const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db) {
leveldb_snapshot_t* result = new leveldb_snapshot_t;
result->rep = db->rep->GetSnapshot();
return result;
}
void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot) {
db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* leveldb_property_value(
leveldb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return NULL;
}
}
void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes) {
Range* ranges = new Range[num_ranges];
for (int i = 0; i < num_ranges; i++) {
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
}
db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
delete[] ranges;
}
void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
// Pass NULL Slice if corresponding "const char*" is NULL
(start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
}
void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, DestroyDB(name, options->rep));
}
void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, RepairDB(name, options->rep));
}
void leveldb_iter_destroy(leveldb_iterator_t* iter) {
delete iter->rep;
delete iter;
}
unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
return iter->rep->Valid();
}
void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
iter->rep->SeekToFirst();
}
void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
iter->rep->SeekToLast();
}
void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
iter->rep->Seek(Slice(k, klen));
}
void leveldb_iter_next(leveldb_iterator_t* iter) {
iter->rep->Next();
}
void leveldb_iter_prev(leveldb_iterator_t* iter) {
iter->rep->Prev();
}
const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
Slice s = iter->rep->key();
*klen = s.size();
return s.data();
}
const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
Slice s = iter->rep->value();
*vlen = s.size();
return s.data();
}
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
leveldb_writebatch_t* leveldb_writebatch_create() {
return new leveldb_writebatch_t;
}
void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
delete b;
}
void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
b->rep.Clear();
}
void leveldb_writebatch_put(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
}
void leveldb_writebatch_delete(
leveldb_writebatch_t* b,
const char* key, size_t klen) {
b->rep.Delete(Slice(key, klen));
}
void leveldb_writebatch_iterate(
leveldb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
class H : public WriteBatch::Handler {
public:
void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
void (*deleted_)(void*, const char* k, size_t klen);
virtual void Put(const Slice& key, const Slice& value) {
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
}
virtual void Delete(const Slice& key) {
(*deleted_)(state_, key.data(), key.size());
}
};
H handler;
handler.state_ = state;
handler.put_ = put;
handler.deleted_ = deleted;
b->rep.Iterate(&handler);
}
leveldb_options_t* leveldb_options_create() {
return new leveldb_options_t;
}
void leveldb_options_destroy(leveldb_options_t* options) {
delete options;
}
void leveldb_options_set_comparator(
leveldb_options_t* opt,
leveldb_comparator_t* cmp) {
opt->rep.comparator = cmp;
}
void leveldb_options_set_filter_policy(
leveldb_options_t* opt,
leveldb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void leveldb_options_set_create_if_missing(
leveldb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v;
}
void leveldb_options_set_error_if_exists(
leveldb_options_t* opt, unsigned char v) {
opt->rep.error_if_exists = v;
}
void leveldb_options_set_paranoid_checks(
leveldb_options_t* opt, unsigned char v) {
opt->rep.paranoid_checks = v;
}
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL);
}
void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
opt->rep.info_log = (l ? l->rep : NULL);
}
void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
opt->rep.block_cache = c->rep;
}
void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
opt->rep.block_size = s;
}
void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
opt->rep.block_restart_interval = n;
}
void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}
leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*)) {
leveldb_comparator_t* result = new leveldb_comparator_t;
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
result->name_ = name;
return result;
}
void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
delete cmp;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*)) {
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_ = create_filter;
result->key_match_ = key_may_match;
result->name_ = name;
return result;
}
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
delete filter;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
// Make a leveldb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public leveldb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() { delete rep_; }
const char* Name() const { return rep_->Name(); }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
return rep_->CreateFilter(keys, n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return rep_->KeyMayMatch(key, filter);
}
static void DoNothing(void*) { }
};
Wrapper* wrapper = new Wrapper;
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
wrapper->state_ = NULL;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
leveldb_readoptions_t* leveldb_readoptions_create() {
return new leveldb_readoptions_t;
}
void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
delete opt;
}
void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t* opt,
unsigned char v) {
opt->rep.verify_checksums = v;
}
void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t* opt, unsigned char v) {
opt->rep.fill_cache = v;
}
void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t* opt,
const leveldb_snapshot_t* snap) {
opt->rep.snapshot = (snap ? snap->rep : NULL);
}
leveldb_writeoptions_t* leveldb_writeoptions_create() {
return new leveldb_writeoptions_t;
}
void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
delete opt;
}
void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t* opt, unsigned char v) {
opt->rep.sync = v;
}
leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
leveldb_cache_t* c = new leveldb_cache_t;
c->rep = NewLRUCache(capacity);
return c;
}
void leveldb_cache_destroy(leveldb_cache_t* cache) {
delete cache->rep;
delete cache;
}
leveldb_env_t* leveldb_create_default_env() {
leveldb_env_t* result = new leveldb_env_t;
result->rep = Env::Default();
result->is_default = true;
return result;
}
void leveldb_env_destroy(leveldb_env_t* env) {
if (!env->is_default) delete env->rep;
delete env;
}
+void leveldb_free(void* ptr) {
+ free(ptr);
+}
+
+int leveldb_major_version() {
+ return kMajorVersion;
+}
+
+int leveldb_minor_version() {
+ return kMinorVersion;
+}
+
} // end extern "C"
diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c
index 979244715..7cd5ee020 100644
--- a/src/leveldb/db/c_test.c
+++ b/src/leveldb/db/c_test.c
@@ -1,381 +1,390 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors. */
#include "leveldb/c.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
const char* phase = "";
static char dbname[200];
static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name);
phase = name;
}
static const char* GetTempDir(void) {
const char* ret = getenv("TEST_TMPDIR");
if (ret == NULL || ret[0] == '\0')
ret = "/tmp";
return ret;
}
#define CheckNoError(err) \
if ((err) != NULL) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
abort(); \
}
#define CheckCondition(cond) \
if (!(cond)) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
abort(); \
}
static void CheckEqual(const char* expected, const char* v, size_t n) {
if (expected == NULL && v == NULL) {
// ok
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
memcmp(expected, v, n) == 0) {
// ok
return;
} else {
fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase,
(expected ? expected : "(null)"),
(v ? v : "(null"));
abort();
}
}
static void Free(char** ptr) {
if (*ptr) {
free(*ptr);
*ptr = NULL;
}
}
static void CheckGet(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckIter(leveldb_iterator_t* iter,
const char* key, const char* val) {
size_t len;
const char* str;
str = leveldb_iter_key(iter, &len);
CheckEqual(key, str, len);
str = leveldb_iter_value(iter, &len);
CheckEqual(val, str, len);
}
// Callback from leveldb_writebatch_iterate()
static void CheckPut(void* ptr,
const char* k, size_t klen,
const char* v, size_t vlen) {
int* state = (int*) ptr;
CheckCondition(*state < 2);
switch (*state) {
case 0:
CheckEqual("bar", k, klen);
CheckEqual("b", v, vlen);
break;
case 1:
CheckEqual("box", k, klen);
CheckEqual("c", v, vlen);
break;
}
(*state)++;
}
// Callback from leveldb_writebatch_iterate()
static void CheckDel(void* ptr, const char* k, size_t klen) {
int* state = (int*) ptr;
CheckCondition(*state == 2);
CheckEqual("bar", k, klen);
(*state)++;
}
static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen,
const char* b, size_t blen) {
int n = (alen < blen) ? alen : blen;
int r = memcmp(a, b, n);
if (r == 0) {
if (alen < blen) r = -1;
else if (alen > blen) r = +1;
}
return r;
}
static const char* CmpName(void* arg) {
return "foo";
}
// Custom filter policy
static unsigned char fake_filter_result = 1;
static void FilterDestroy(void* arg) { }
static const char* FilterName(void* arg) {
return "TestFilter";
}
static char* FilterCreate(
void* arg,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length) {
*filter_length = 4;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
unsigned char FilterKeyMatch(
void* arg,
const char* key, size_t length,
const char* filter, size_t filter_length) {
CheckCondition(filter_length == 4);
CheckCondition(memcmp(filter, "fake", 4) == 0);
return fake_filter_result;
}
int main(int argc, char** argv) {
leveldb_t* db;
leveldb_comparator_t* cmp;
leveldb_cache_t* cache;
leveldb_env_t* env;
leveldb_options_t* options;
leveldb_readoptions_t* roptions;
leveldb_writeoptions_t* woptions;
char* err = NULL;
int run = -1;
+ CheckCondition(leveldb_major_version() >= 1);
+ CheckCondition(leveldb_minor_version() >= 1);
+
snprintf(dbname, sizeof(dbname),
"%s/leveldb_c_test-%d",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = leveldb_create_default_env();
cache = leveldb_cache_create_lru(100000);
options = leveldb_options_create();
leveldb_options_set_comparator(options, cmp);
leveldb_options_set_error_if_exists(options, 1);
leveldb_options_set_cache(options, cache);
leveldb_options_set_env(options, env);
leveldb_options_set_info_log(options, NULL);
leveldb_options_set_write_buffer_size(options, 100000);
leveldb_options_set_paranoid_checks(options, 1);
leveldb_options_set_max_open_files(options, 10);
leveldb_options_set_block_size(options, 1024);
leveldb_options_set_block_restart_interval(options, 8);
leveldb_options_set_compression(options, leveldb_no_compression);
roptions = leveldb_readoptions_create();
leveldb_readoptions_set_verify_checksums(roptions, 1);
leveldb_readoptions_set_fill_cache(roptions, 0);
woptions = leveldb_writeoptions_create();
leveldb_writeoptions_set_sync(woptions, 1);
StartPhase("destroy");
leveldb_destroy_db(options, dbname, &err);
Free(&err);
StartPhase("open_error");
db = leveldb_open(options, dbname, &err);
CheckCondition(err != NULL);
Free(&err);
+ StartPhase("leveldb_free");
+ db = leveldb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ leveldb_free(err);
+ err = NULL;
+
StartPhase("open");
leveldb_options_set_create_if_missing(options, 1);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
StartPhase("put");
leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
leveldb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
leveldb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch");
{
leveldb_writebatch_t* wb = leveldb_writebatch_create();
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
leveldb_writebatch_clear(wb);
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
leveldb_writebatch_put(wb, "box", 3, "c", 1);
leveldb_writebatch_delete(wb, "bar", 3);
leveldb_write(db, woptions, wb, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
int pos = 0;
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
CheckCondition(pos == 3);
leveldb_writebatch_destroy(wb);
}
StartPhase("iter");
{
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_first(iter);
CheckCondition(leveldb_iter_valid(iter));
CheckIter(iter, "box", "c");
leveldb_iter_next(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_prev(iter);
CheckIter(iter, "box", "c");
leveldb_iter_prev(iter);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c");
leveldb_iter_get_error(iter, &err);
CheckNoError(err);
leveldb_iter_destroy(iter);
}
StartPhase("approximate_sizes");
{
int i;
int n = 20000;
char keybuf[100];
char valbuf[100];
uint64_t sizes[2];
const char* start[2] = { "a", "k00000000000000010000" };
size_t start_len[2] = { 1, 21 };
const char* limit[2] = { "k00000000000000010000", "z" };
size_t limit_len[2] = { 21, 1 };
leveldb_writeoptions_set_sync(woptions, 0);
for (i = 0; i < n; i++) {
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
&err);
CheckNoError(err);
}
leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
CheckCondition(sizes[0] > 0);
CheckCondition(sizes[1] > 0);
}
StartPhase("property");
{
char* prop = leveldb_property_value(db, "nosuchprop");
CheckCondition(prop == NULL);
prop = leveldb_property_value(db, "leveldb.stats");
CheckCondition(prop != NULL);
Free(&prop);
}
StartPhase("snapshot");
{
const leveldb_snapshot_t* snap;
snap = leveldb_create_snapshot(db);
leveldb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
leveldb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", "hello");
leveldb_readoptions_set_snapshot(roptions, NULL);
CheckGet(db, roptions, "foo", NULL);
leveldb_release_snapshot(db, snap);
}
StartPhase("repair");
{
leveldb_close(db);
leveldb_options_set_create_if_missing(options, 0);
leveldb_options_set_error_if_exists(options, 0);
leveldb_repair_db(options, dbname, &err);
CheckNoError(err);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
leveldb_options_set_create_if_missing(options, 1);
leveldb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
leveldb_filterpolicy_t* policy;
if (run == 0) {
policy = leveldb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
} else {
policy = leveldb_filterpolicy_create_bloom(10);
}
// Create new database
leveldb_close(db);
leveldb_destroy_db(options, dbname, &err);
leveldb_options_set_filter_policy(options, policy);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
leveldb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
if (phase == 0) {
// Must not find value when custom filter returns false
fake_filter_result = 0;
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
leveldb_options_set_filter_policy(options, NULL);
leveldb_filterpolicy_destroy(policy);
}
StartPhase("cleanup");
leveldb_close(db);
leveldb_options_destroy(options);
leveldb_readoptions_destroy(roptions);
leveldb_writeoptions_destroy(woptions);
leveldb_cache_destroy(cache);
leveldb_comparator_destroy(cmp);
leveldb_env_destroy(env);
fprintf(stderr, "PASS\n");
return 0;
}
diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
index 21d3e25f3..7abdf8758 100644
--- a/src/leveldb/db/db_bench.cc
+++ b/src/leveldb/db/db_bench.cc
@@ -1,978 +1,979 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "leveldb/cache.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/write_batch.h"
#include "port/port.h"
#include "util/crc32c.h"
#include "util/histogram.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
// fillseq -- write N values in sequential key order in async mode
// fillrandom -- write N values in random key order in async mode
// overwrite -- overwrite N values in random key order in async mode
// fillsync -- write N/100 values in random key order in sync mode
// fill100K -- write N/1000 100K values in random order in async mode
// deleteseq -- delete N keys in sequential order
// deleterandom -- delete N keys in random order
// readseq -- read N times sequentially
// readreverse -- read N times in reverse order
// readrandom -- read N times in random order
// readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB
// seekrandom -- N random seeks
// crc32c -- repeated crc32c of 4K of data
// acquireload -- load N*1000 times
// Meta operations:
// compact -- Compact the entire DB
// stats -- Print DB stats
// sstables -- Print sstable info
// heapprofile -- Dump a heap profile (if supported by this port)
static const char* FLAGS_benchmarks =
"fillseq,"
"fillsync,"
"fillrandom,"
"overwrite,"
"readrandom,"
"readrandom," // Extra run to allow previous compactions to quiesce
"readseq,"
"readreverse,"
"compact,"
"readrandom,"
"readseq,"
"readreverse,"
"fill100K,"
"crc32c,"
"snappycomp,"
"snappyuncomp,"
"acquireload,"
;
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Number of concurrent threads to run.
static int FLAGS_threads = 1;
// Size of each value
static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Number of bytes to buffer in memtable before compacting
// (initialized to default value by "main")
static int FLAGS_write_buffer_size = 0;
// Number of bytes to use as a cache of uncompressed data.
// Negative means use default settings.
static int FLAGS_cache_size = -1;
// Maximum number of files to keep open at the same time (use default if == 0)
static int FLAGS_open_files = 0;
// Bloom filter bits per key.
// Negative means use default settings.
static int FLAGS_bloom_bits = -1;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
// Use the db with the following name.
static const char* FLAGS_db = NULL;
namespace leveldb {
namespace {
// Helper for quickly generating random data.
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static Slice TrimSpace(Slice s) {
int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
int limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return;
if (!str->empty()) {
str->push_back(' ');
}
str->append(msg.data(), msg.size());
}
class Stats {
private:
double start_;
double finish_;
double seconds_;
int done_;
int next_report_;
int64_t bytes_;
double last_op_finish_;
Histogram hist_;
std::string message_;
public:
Stats() { Start(); }
void Start() {
next_report_ = 100;
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
bytes_ = 0;
seconds_ = 0;
start_ = Env::Default()->NowMicros();
finish_ = start_;
message_.clear();
}
void Merge(const Stats& other) {
hist_.Merge(other.hist_);
done_ += other.done_;
bytes_ += other.bytes_;
seconds_ += other.seconds_;
if (other.start_ < start_) start_ = other.start_;
if (other.finish_ > finish_) finish_ = other.finish_;
// Just keep the messages from one thread
if (message_.empty()) message_ = other.message_;
}
void Stop() {
finish_ = Env::Default()->NowMicros();
seconds_ = (finish_ - start_) * 1e-6;
}
void AddMessage(Slice msg) {
AppendWithSpace(&message_, msg);
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros();
double micros = now - last_op_finish_;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void AddBytes(int64_t n) {
bytes_ += n;
}
void Report(const Slice& name) {
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
std::string extra;
if (bytes_ > 0) {
// Rate is computed on actual elapsed time, not the sum of per-thread
// elapsed times.
double elapsed = (finish_ - start_) * 1e-6;
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / elapsed);
extra = rate;
}
AppendWithSpace(&extra, message_);
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
seconds_ * 1e6 / done_,
(extra.empty() ? "" : " "),
extra.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
};
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
// Each thread goes through the following states:
// (1) initializing
// (2) waiting for others to be initialized
// (3) running
// (4) done
int num_initialized;
int num_done;
bool start;
SharedState() : cv(&mu) { }
};
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
int tid; // 0..n-1 when running in n threads
Random rand; // Has different seeds for different threads
Stats stats;
SharedState* shared;
ThreadState(int index)
: tid(index),
rand(1000 + index) {
}
};
} // namespace
class Benchmark {
private:
Cache* cache_;
const FilterPolicy* filter_policy_;
DB* db_;
int num_;
int value_size_;
int entries_per_batch_;
WriteOptions write_options_;
int reads_;
int heap_counter_;
void PrintHeader() {
const int kKeySize = 16;
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
FLAGS_value_size,
static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
/ 1048576.0));
fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
(((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
/ 1048576.0));
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
// See if snappy is working by attempting to compress a compressible string
const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
std::string compressed;
if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
} else if (compressed.size() >= sizeof(text)) {
fprintf(stdout, "WARNING: Snappy compression is not effective\n");
}
}
void PrintEnvironment() {
fprintf(stderr, "LevelDB: version %d.%d\n",
kMajorVersion, kMinorVersion);
#if defined(__linux)
time_t now = time(NULL);
fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != NULL) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != NULL) {
const char* sep = strchr(line, ':');
if (sep == NULL) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
public:
Benchmark()
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits)
: NULL),
db_(NULL),
num_(FLAGS_num),
value_size_(FLAGS_value_size),
entries_per_batch_(1),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
heap_counter_(0) {
std::vector<std::string> files;
Env::Default()->GetChildren(FLAGS_db, &files);
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
}
}
if (!FLAGS_use_existing_db) {
DestroyDB(FLAGS_db, Options());
}
}
~Benchmark() {
delete db_;
delete cache_;
delete filter_policy_;
}
void Run() {
PrintHeader();
Open();
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
// Reset parameters that may be overriddden bwlow
num_ = FLAGS_num;
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
value_size_ = FLAGS_value_size;
entries_per_batch_ = 1;
write_options_ = WriteOptions();
void (Benchmark::*method)(ThreadState*) = NULL;
bool fresh_db = false;
int num_threads = FLAGS_threads;
if (name == Slice("fillseq")) {
fresh_db = true;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillbatch")) {
fresh_db = true;
entries_per_batch_ = 1000;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillrandom")) {
fresh_db = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("overwrite")) {
fresh_db = false;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fillsync")) {
fresh_db = true;
num_ /= 1000;
write_options_.sync = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fill100K")) {
fresh_db = true;
num_ /= 1000;
value_size_ = 100 * 1000;
method = &Benchmark::WriteRandom;
} else if (name == Slice("readseq")) {
method = &Benchmark::ReadSequential;
} else if (name == Slice("readreverse")) {
method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) {
method = &Benchmark::ReadRandom;
} else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing;
} else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) {
method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) {
reads_ /= 1000;
method = &Benchmark::ReadRandom;
} else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) {
method = &Benchmark::DeleteRandom;
} else if (name == Slice("readwhilewriting")) {
num_threads++; // Add extra thread for writing
method = &Benchmark::ReadWhileWriting;
} else if (name == Slice("compact")) {
method = &Benchmark::Compact;
} else if (name == Slice("crc32c")) {
method = &Benchmark::Crc32c;
} else if (name == Slice("acquireload")) {
method = &Benchmark::AcquireLoad;
} else if (name == Slice("snappycomp")) {
method = &Benchmark::SnappyCompress;
} else if (name == Slice("snappyuncomp")) {
method = &Benchmark::SnappyUncompress;
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("stats")) {
PrintStats("leveldb.stats");
} else if (name == Slice("sstables")) {
PrintStats("leveldb.sstables");
} else {
if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
}
if (fresh_db) {
if (FLAGS_use_existing_db) {
fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
name.ToString().c_str());
method = NULL;
} else {
delete db_;
db_ = NULL;
DestroyDB(FLAGS_db, Options());
Open();
}
}
if (method != NULL) {
RunBenchmark(num_threads, name, method);
}
}
}
private:
struct ThreadArg {
Benchmark* bm;
SharedState* shared;
ThreadState* thread;
void (Benchmark::*method)(ThreadState*);
};
static void ThreadBody(void* v) {
ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
SharedState* shared = arg->shared;
ThreadState* thread = arg->thread;
{
MutexLock l(&shared->mu);
shared->num_initialized++;
if (shared->num_initialized >= shared->total) {
shared->cv.SignalAll();
}
while (!shared->start) {
shared->cv.Wait();
}
}
thread->stats.Start();
(arg->bm->*(arg->method))(thread);
thread->stats.Stop();
{
MutexLock l(&shared->mu);
shared->num_done++;
if (shared->num_done >= shared->total) {
shared->cv.SignalAll();
}
}
}
void RunBenchmark(int n, Slice name,
void (Benchmark::*method)(ThreadState*)) {
SharedState shared;
shared.total = n;
shared.num_initialized = 0;
shared.num_done = 0;
shared.start = false;
ThreadArg* arg = new ThreadArg[n];
for (int i = 0; i < n; i++) {
arg[i].bm = this;
arg[i].method = method;
arg[i].shared = &shared;
arg[i].thread = new ThreadState(i);
arg[i].thread->shared = &shared;
Env::Default()->StartThread(ThreadBody, &arg[i]);
}
shared.mu.Lock();
while (shared.num_initialized < n) {
shared.cv.Wait();
}
shared.start = true;
shared.cv.SignalAll();
while (shared.num_done < n) {
shared.cv.Wait();
}
shared.mu.Unlock();
for (int i = 1; i < n; i++) {
arg[0].thread->stats.Merge(arg[i].thread->stats);
}
arg[0].thread->stats.Report(name);
for (int i = 0; i < n; i++) {
delete arg[i].thread;
}
delete[] arg;
}
void Crc32c(ThreadState* thread) {
// Checksum about 500MB of data total
const int size = 4096;
const char* label = "(4K per op)";
std::string data(size, 'x');
int64_t bytes = 0;
uint32_t crc = 0;
while (bytes < 500 * 1048576) {
crc = crc32c::Value(data.data(), size);
thread->stats.FinishedSingleOp();
bytes += size;
}
// Print so result is not dead
fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(label);
}
void AcquireLoad(ThreadState* thread) {
int dummy;
port::AtomicPointer ap(&dummy);
int count = 0;
void *ptr = NULL;
thread->stats.AddMessage("(each op is 1000 loads)");
while (count < 100000) {
for (int i = 0; i < 1000; i++) {
ptr = ap.Acquire_Load();
}
count++;
thread->stats.FinishedSingleOp();
}
if (ptr == NULL) exit(1); // Disable unused variable warning.
}
void SnappyCompress(ThreadState* thread) {
RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
int64_t bytes = 0;
int64_t produced = 0;
bool ok = true;
std::string compressed;
while (ok && bytes < 1024 * 1048576) { // Compress 1G
ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
produced += compressed.size();
bytes += input.size();
thread->stats.FinishedSingleOp();
}
if (!ok) {
thread->stats.AddMessage("(snappy failure)");
} else {
char buf[100];
snprintf(buf, sizeof(buf), "(output: %.1f%%)",
(produced * 100.0) / bytes);
thread->stats.AddMessage(buf);
thread->stats.AddBytes(bytes);
}
}
void SnappyUncompress(ThreadState* thread) {
RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
std::string compressed;
bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
int64_t bytes = 0;
char* uncompressed = new char[input.size()];
while (ok && bytes < 1024 * 1048576) { // Compress 1G
ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
uncompressed);
bytes += input.size();
thread->stats.FinishedSingleOp();
}
delete[] uncompressed;
if (!ok) {
thread->stats.AddMessage("(snappy failure)");
} else {
thread->stats.AddBytes(bytes);
}
}
void Open() {
assert(db_ == NULL);
Options options;
options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_open_files = FLAGS_open_files;
options.filter_policy = filter_policy_;
Status s = DB::Open(options, FLAGS_db, &db_);
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
exit(1);
}
}
void WriteSeq(ThreadState* thread) {
DoWrite(thread, true);
}
void WriteRandom(ThreadState* thread) {
DoWrite(thread, false);
}
void DoWrite(ThreadState* thread, bool seq) {
if (num_ != FLAGS_num) {
char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_);
thread->stats.AddMessage(msg);
}
RandomGenerator gen;
WriteBatch batch;
Status s;
int64_t bytes = 0;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) {
const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
batch.Put(key, gen.Generate(value_size_));
bytes += value_size_ + strlen(key);
thread->stats.FinishedSingleOp();
}
s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
thread->stats.AddBytes(bytes);
}
void ReadSequential(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0;
int64_t bytes = 0;
for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
bytes += iter->key().size() + iter->value().size();
thread->stats.FinishedSingleOp();
++i;
}
delete iter;
thread->stats.AddBytes(bytes);
}
void ReadReverse(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0;
int64_t bytes = 0;
for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
bytes += iter->key().size() + iter->value().size();
thread->stats.FinishedSingleOp();
++i;
}
delete iter;
thread->stats.AddBytes(bytes);
}
void ReadRandom(ThreadState* thread) {
ReadOptions options;
std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k);
if (db_->Get(options, key, &value).ok()) {
found++;
}
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) {
ReadOptions options;
std::string value;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d.", k);
db_->Get(options, key, &value);
thread->stats.FinishedSingleOp();
}
}
void ReadHot(ThreadState* thread) {
ReadOptions options;
std::string value;
const int range = (FLAGS_num + 99) / 100;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % range;
snprintf(key, sizeof(key), "%016d", k);
db_->Get(options, key, &value);
thread->stats.FinishedSingleOp();
}
}
void SeekRandom(ThreadState* thread) {
ReadOptions options;
std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
Iterator* iter = db_->NewIterator(options);
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k);
iter->Seek(key);
if (iter->Valid() && iter->key() == key) found++;
delete iter;
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void DoDelete(ThreadState* thread, bool seq) {
RandomGenerator gen;
WriteBatch batch;
Status s;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) {
const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
batch.Delete(key);
thread->stats.FinishedSingleOp();
}
s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "del error: %s\n", s.ToString().c_str());
exit(1);
}
}
}
void DeleteSeq(ThreadState* thread) {
DoDelete(thread, true);
}
void DeleteRandom(ThreadState* thread) {
DoDelete(thread, false);
}
void ReadWhileWriting(ThreadState* thread) {
if (thread->tid > 0) {
ReadRandom(thread);
} else {
// Special thread that keeps writing until other threads are done.
RandomGenerator gen;
while (true) {
{
MutexLock l(&thread->shared->mu);
if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
// Other threads have finished
break;
}
}
const int k = thread->rand.Next() % FLAGS_num;
char key[100];
snprintf(key, sizeof(key), "%016d", k);
Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
// Do not count any of the preceding work/delay in stats.
thread->stats.Start();
}
}
void Compact(ThreadState* thread) {
db_->CompactRange(NULL, NULL);
}
void PrintStats(const char* key) {
std::string stats;
if (!db_->GetProperty(key, &stats)) {
stats = "(failed)";
}
fprintf(stdout, "\n%s\n", stats.c_str());
}
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
WritableFile* file;
Status s = Env::Default()->NewWritableFile(fname, &file);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file);
delete file;
if (!ok) {
fprintf(stderr, "heap profiling not supported\n");
Env::Default()->DeleteFile(fname);
}
}
};
} // namespace leveldb
int main(int argc, char** argv) {
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
FLAGS_open_files = leveldb::Options().max_open_files;
std::string default_db_path;
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
FLAGS_threads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
FLAGS_bloom_bits = n;
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
FLAGS_open_files = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
// Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) {
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str();
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index 90c1c811d..c9de169f2 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -1,1463 +1,1467 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl.h"
#include <algorithm>
#include <set>
#include <string>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "db/builder.h"
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "leveldb/table.h"
#include "leveldb/table_builder.h"
#include "port/port.h"
#include "table/block.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/mutexlock.h"
namespace leveldb {
// Information kept for every waiting writer
struct DBImpl::Writer {
Status status;
WriteBatch* batch;
bool sync;
bool done;
port::CondVar cv;
explicit Writer(port::Mutex* mu) : cv(mu) { }
};
struct DBImpl::CompactionState {
Compaction* const compaction;
// Sequence numbers < smallest_snapshot are not significant since we
// will never have to service a snapshot below smallest_snapshot.
// Therefore if we have seen a sequence number S <= smallest_snapshot,
// we can drop all entries for the same key with sequence numbers < S.
SequenceNumber smallest_snapshot;
// Files produced by compaction
struct Output {
uint64_t number;
uint64_t file_size;
InternalKey smallest, largest;
};
std::vector<Output> outputs;
// State kept for output being generated
WritableFile* outfile;
TableBuilder* builder;
uint64_t total_bytes;
Output* current_output() { return &outputs[outputs.size()-1]; }
explicit CompactionState(Compaction* c)
: compaction(c),
outfile(NULL),
builder(NULL),
total_bytes(0) {
}
};
// Fix user-supplied options to be reasonable
template <class T,class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
Options SanitizeOptions(const std::string& dbname,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src) {
Options result = src;
result.comparator = icmp;
result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
ClipToRange(&result.max_open_files, 20, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20);
if (result.info_log == NULL) {
// Open a log file in the same directory as the db
src.env->CreateDir(dbname); // In case it does not exist
src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
if (!s.ok()) {
// No place suitable for logging
result.info_log = NULL;
}
}
if (result.block_cache == NULL) {
result.block_cache = NewLRUCache(8 << 20);
}
return result;
}
DBImpl::DBImpl(const Options& options, const std::string& dbname)
: env_(options.env),
internal_comparator_(options.comparator),
internal_filter_policy_(options.filter_policy),
options_(SanitizeOptions(
dbname, &internal_comparator_, &internal_filter_policy_, options)),
owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache),
dbname_(dbname),
db_lock_(NULL),
shutting_down_(NULL),
bg_cv_(&mutex_),
mem_(new MemTable(internal_comparator_)),
imm_(NULL),
logfile_(NULL),
logfile_number_(0),
log_(NULL),
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
manual_compaction_(NULL) {
mem_->Ref();
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options.max_open_files - 10;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
versions_ = new VersionSet(dbname_, &options_, table_cache_,
&internal_comparator_);
}
DBImpl::~DBImpl() {
// Wait for background work to finish
mutex_.Lock();
shutting_down_.Release_Store(this); // Any non-NULL value is ok
while (bg_compaction_scheduled_) {
bg_cv_.Wait();
}
mutex_.Unlock();
if (db_lock_ != NULL) {
env_->UnlockFile(db_lock_);
}
delete versions_;
if (mem_ != NULL) mem_->Unref();
if (imm_ != NULL) imm_->Unref();
delete tmp_batch_;
delete log_;
delete logfile_;
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
if (owns_cache_) {
delete options_.block_cache;
}
}
Status DBImpl::NewDB() {
VersionEdit new_db;
new_db.SetComparatorName(user_comparator()->Name());
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::string manifest = DescriptorFileName(dbname_, 1);
WritableFile* file;
Status s = env_->NewWritableFile(manifest, &file);
if (!s.ok()) {
return s;
}
{
log::Writer log(file);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
if (s.ok()) {
s = file->Close();
}
}
delete file;
if (s.ok()) {
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(manifest);
}
return s;
}
void DBImpl::MaybeIgnoreError(Status* s) const {
if (s->ok() || options_.paranoid_checks) {
// No change needed
} else {
Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
*s = Status::OK();
}
}
void DBImpl::DeleteObsoleteFiles() {
// Make a set of all of the live files
std::set<uint64_t> live = pending_outputs_;
versions_->AddLiveFiles(&live);
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
bool keep = true;
switch (type) {
case kLogFile:
keep = ((number >= versions_->LogNumber()) ||
(number == versions_->PrevLogNumber()));
break;
case kDescriptorFile:
// Keep my manifest file, and any newer incarnations'
// (in case there is a race that allows other incarnations)
keep = (number >= versions_->ManifestFileNumber());
break;
case kTableFile:
keep = (live.find(number) != live.end());
break;
case kTempFile:
// Any temp files that are currently being written to must
// be recorded in pending_outputs_, which is inserted into "live"
keep = (live.find(number) != live.end());
break;
case kCurrentFile:
case kDBLockFile:
case kInfoLogFile:
keep = true;
break;
}
if (!keep) {
if (type == kTableFile) {
table_cache_->Evict(number);
}
Log(options_.info_log, "Delete type=%d #%lld\n",
int(type),
static_cast<unsigned long long>(number));
env_->DeleteFile(dbname_ + "/" + filenames[i]);
}
}
}
}
Status DBImpl::Recover(VersionEdit* edit) {
mutex_.AssertHeld();
// Ignore error from CreateDir since the creation of the DB is
// committed only when the descriptor is created, and this directory
// may already exist from a previous failed creation attempt.
env_->CreateDir(dbname_);
assert(db_lock_ == NULL);
Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
if (!s.ok()) {
return s;
}
if (!env_->FileExists(CurrentFileName(dbname_))) {
if (options_.create_if_missing) {
s = NewDB();
if (!s.ok()) {
return s;
}
} else {
return Status::InvalidArgument(
dbname_, "does not exist (create_if_missing is false)");
}
} else {
if (options_.error_if_exists) {
return Status::InvalidArgument(
dbname_, "exists (error_if_exists is true)");
}
}
s = versions_->Recover();
if (s.ok()) {
SequenceNumber max_sequence(0);
// Recover from all newer log files than the ones named in the
// descriptor (new log files may have been added by the previous
// incarnation without registering them in the descriptor).
//
// Note that PrevLogNumber() is no longer used, but we pay
// attention to it in case we are recovering a database
// produced by an older version of leveldb.
const uint64_t min_log = versions_->LogNumber();
const uint64_t prev_log = versions_->PrevLogNumber();
std::vector<std::string> filenames;
s = env_->GetChildren(dbname_, &filenames);
if (!s.ok()) {
return s;
}
uint64_t number;
FileType type;
std::vector<uint64_t> logs;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)
&& type == kLogFile
&& ((number >= min_log) || (number == prev_log))) {
logs.push_back(number);
}
}
// Recover in the order in which the logs were generated
std::sort(logs.begin(), logs.end());
for (size_t i = 0; i < logs.size(); i++) {
s = RecoverLogFile(logs[i], edit, &max_sequence);
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
// update the file number allocation counter in VersionSet.
versions_->MarkFileNumberUsed(logs[i]);
}
if (s.ok()) {
if (versions_->LastSequence() < max_sequence) {
versions_->SetLastSequence(max_sequence);
}
}
}
return s;
}
Status DBImpl::RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
SequenceNumber* max_sequence) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
const char* fname;
Status* status; // NULL if options_.paranoid_checks==false
virtual void Corruption(size_t bytes, const Status& s) {
Log(info_log, "%s%s: dropping %d bytes; %s",
(this->status == NULL ? "(ignoring error) " : ""),
fname, static_cast<int>(bytes), s.ToString().c_str());
if (this->status != NULL && this->status->ok()) *this->status = s;
}
};
mutex_.AssertHeld();
// Open the log file
std::string fname = LogFileName(dbname_, log_number);
SequentialFile* file;
Status status = env_->NewSequentialFile(fname, &file);
if (!status.ok()) {
MaybeIgnoreError(&status);
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.fname = fname.c_str();
reporter.status = (options_.paranoid_checks ? &status : NULL);
// We intentially make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log::Reader reader(file, &reporter, true/*checksum*/,
0/*initial_offset*/);
Log(options_.info_log, "Recovering log #%llu",
(unsigned long long) log_number);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = NULL;
while (reader.ReadRecord(&record, &scratch) &&
status.ok()) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
if (mem == NULL) {
mem = new MemTable(internal_comparator_);
mem->Ref();
}
status = WriteBatchInternal::InsertInto(&batch, mem);
MaybeIgnoreError(&status);
if (!status.ok()) {
break;
}
const SequenceNumber last_seq =
WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1;
if (last_seq > *max_sequence) {
*max_sequence = last_seq;
}
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
status = WriteLevel0Table(mem, edit, NULL);
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
break;
}
mem->Unref();
mem = NULL;
}
}
if (status.ok() && mem != NULL) {
status = WriteLevel0Table(mem, edit, NULL);
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
}
if (mem != NULL) mem->Unref();
delete file;
return status;
}
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
Version* base) {
mutex_.AssertHeld();
const uint64_t start_micros = env_->NowMicros();
FileMetaData meta;
meta.number = versions_->NewFileNumber();
pending_outputs_.insert(meta.number);
Iterator* iter = mem->NewIterator();
Log(options_.info_log, "Level-0 table #%llu: started",
(unsigned long long) meta.number);
Status s;
{
mutex_.Unlock();
s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
mutex_.Lock();
}
Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
(unsigned long long) meta.number,
(unsigned long long) meta.file_size,
s.ToString().c_str());
delete iter;
pending_outputs_.erase(meta.number);
// Note that if file_size is zero, the file has been deleted and
// should not be added to the manifest.
int level = 0;
if (s.ok() && meta.file_size > 0) {
const Slice min_user_key = meta.smallest.user_key();
const Slice max_user_key = meta.largest.user_key();
if (base != NULL) {
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
}
edit->AddFile(level, meta.number, meta.file_size,
meta.smallest, meta.largest);
}
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros;
stats.bytes_written = meta.file_size;
stats_[level].Add(stats);
return s;
}
Status DBImpl::CompactMemTable() {
mutex_.AssertHeld();
assert(imm_ != NULL);
// Save the contents of the memtable as a new Table
VersionEdit edit;
Version* base = versions_->current();
base->Ref();
Status s = WriteLevel0Table(imm_, &edit, base);
base->Unref();
if (s.ok() && shutting_down_.Acquire_Load()) {
s = Status::IOError("Deleting DB during memtable compaction");
}
// Replace immutable memtable with the generated Table
if (s.ok()) {
edit.SetPrevLogNumber(0);
edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
s = versions_->LogAndApply(&edit, &mutex_);
}
if (s.ok()) {
// Commit to the new state
imm_->Unref();
imm_ = NULL;
has_imm_.Release_Store(NULL);
DeleteObsoleteFiles();
}
return s;
}
void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
int max_level_with_files = 1;
{
MutexLock l(&mutex_);
Version* base = versions_->current();
for (int level = 1; level < config::kNumLevels; level++) {
if (base->OverlapInLevel(level, begin, end)) {
max_level_with_files = level;
}
}
}
TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
for (int level = 0; level < max_level_with_files; level++) {
TEST_CompactRange(level, begin, end);
}
}
void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
assert(level >= 0);
assert(level + 1 < config::kNumLevels);
InternalKey begin_storage, end_storage;
ManualCompaction manual;
manual.level = level;
manual.done = false;
if (begin == NULL) {
manual.begin = NULL;
} else {
begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
manual.begin = &begin_storage;
}
if (end == NULL) {
manual.end = NULL;
} else {
end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
manual.end = &end_storage;
}
MutexLock l(&mutex_);
while (!manual.done) {
while (manual_compaction_ != NULL) {
bg_cv_.Wait();
}
manual_compaction_ = &manual;
MaybeScheduleCompaction();
while (manual_compaction_ == &manual) {
bg_cv_.Wait();
}
}
}
Status DBImpl::TEST_CompactMemTable() {
// NULL batch means just wait for earlier writes to be done
Status s = Write(WriteOptions(), NULL);
if (s.ok()) {
// Wait until the compaction completes
MutexLock l(&mutex_);
while (imm_ != NULL && bg_error_.ok()) {
bg_cv_.Wait();
}
if (imm_ != NULL) {
s = bg_error_;
}
}
return s;
}
void DBImpl::MaybeScheduleCompaction() {
mutex_.AssertHeld();
if (bg_compaction_scheduled_) {
// Already scheduled
} else if (shutting_down_.Acquire_Load()) {
// DB is being deleted; no more background compactions
} else if (imm_ == NULL &&
manual_compaction_ == NULL &&
!versions_->NeedsCompaction()) {
// No work to be done
} else {
bg_compaction_scheduled_ = true;
env_->Schedule(&DBImpl::BGWork, this);
}
}
void DBImpl::BGWork(void* db) {
reinterpret_cast<DBImpl*>(db)->BackgroundCall();
}
void DBImpl::BackgroundCall() {
MutexLock l(&mutex_);
assert(bg_compaction_scheduled_);
if (!shutting_down_.Acquire_Load()) {
Status s = BackgroundCompaction();
- if (!s.ok()) {
+ if (s.ok()) {
+ // Success
+ } else if (shutting_down_.Acquire_Load()) {
+ // Error most likely due to shutdown; do not wait
+ } else {
// Wait a little bit before retrying background compaction in
// case this is an environmental problem and we do not want to
// chew up resources for failed compactions for the duration of
// the problem.
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
Log(options_.info_log, "Waiting after background compaction error: %s",
s.ToString().c_str());
mutex_.Unlock();
env_->SleepForMicroseconds(1000000);
mutex_.Lock();
}
}
bg_compaction_scheduled_ = false;
// Previous compaction may have produced too many files in a level,
// so reschedule another compaction if needed.
MaybeScheduleCompaction();
bg_cv_.SignalAll();
}
Status DBImpl::BackgroundCompaction() {
mutex_.AssertHeld();
if (imm_ != NULL) {
return CompactMemTable();
}
Compaction* c;
bool is_manual = (manual_compaction_ != NULL);
InternalKey manual_end;
if (is_manual) {
ManualCompaction* m = manual_compaction_;
c = versions_->CompactRange(m->level, m->begin, m->end);
m->done = (c == NULL);
if (c != NULL) {
manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
}
Log(options_.info_log,
"Manual compaction at level-%d from %s .. %s; will stop at %s\n",
m->level,
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"),
(m->done ? "(end)" : manual_end.DebugString().c_str()));
} else {
c = versions_->PickCompaction();
}
Status status;
if (c == NULL) {
// Nothing to do
} else if (!is_manual && c->IsTrivialMove()) {
// Move file to next level
assert(c->num_input_files(0) == 1);
FileMetaData* f = c->input(0, 0);
c->edit()->DeleteFile(c->level(), f->number);
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest);
status = versions_->LogAndApply(c->edit(), &mutex_);
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
static_cast<unsigned long long>(f->number),
c->level() + 1,
static_cast<unsigned long long>(f->file_size),
status.ToString().c_str(),
versions_->LevelSummary(&tmp));
} else {
CompactionState* compact = new CompactionState(c);
status = DoCompactionWork(compact);
CleanupCompaction(compact);
c->ReleaseInputs();
DeleteObsoleteFiles();
}
delete c;
if (status.ok()) {
// Done
} else if (shutting_down_.Acquire_Load()) {
// Ignore compaction errors found during shutting down
} else {
Log(options_.info_log,
"Compaction error: %s", status.ToString().c_str());
if (options_.paranoid_checks && bg_error_.ok()) {
bg_error_ = status;
}
}
if (is_manual) {
ManualCompaction* m = manual_compaction_;
if (!status.ok()) {
m->done = true;
}
if (!m->done) {
// We only compacted part of the requested range. Update *m
// to the range that is left to be compacted.
m->tmp_storage = manual_end;
m->begin = &m->tmp_storage;
}
manual_compaction_ = NULL;
}
return status;
}
void DBImpl::CleanupCompaction(CompactionState* compact) {
mutex_.AssertHeld();
if (compact->builder != NULL) {
// May happen if we get a shutdown call in the middle of compaction
compact->builder->Abandon();
delete compact->builder;
} else {
assert(compact->outfile == NULL);
}
delete compact->outfile;
for (size_t i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i];
pending_outputs_.erase(out.number);
}
delete compact;
}
Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
assert(compact != NULL);
assert(compact->builder == NULL);
uint64_t file_number;
{
mutex_.Lock();
file_number = versions_->NewFileNumber();
pending_outputs_.insert(file_number);
CompactionState::Output out;
out.number = file_number;
out.smallest.Clear();
out.largest.Clear();
compact->outputs.push_back(out);
mutex_.Unlock();
}
// Make the output file
std::string fname = TableFileName(dbname_, file_number);
Status s = env_->NewWritableFile(fname, &compact->outfile);
if (s.ok()) {
compact->builder = new TableBuilder(options_, compact->outfile);
}
return s;
}
Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
Iterator* input) {
assert(compact != NULL);
assert(compact->outfile != NULL);
assert(compact->builder != NULL);
const uint64_t output_number = compact->current_output()->number;
assert(output_number != 0);
// Check for iterator errors
Status s = input->status();
const uint64_t current_entries = compact->builder->NumEntries();
if (s.ok()) {
s = compact->builder->Finish();
} else {
compact->builder->Abandon();
}
const uint64_t current_bytes = compact->builder->FileSize();
compact->current_output()->file_size = current_bytes;
compact->total_bytes += current_bytes;
delete compact->builder;
compact->builder = NULL;
// Finish and check for file errors
if (s.ok()) {
s = compact->outfile->Sync();
}
if (s.ok()) {
s = compact->outfile->Close();
}
delete compact->outfile;
compact->outfile = NULL;
if (s.ok() && current_entries > 0) {
// Verify that the table is usable
Iterator* iter = table_cache_->NewIterator(ReadOptions(),
output_number,
current_bytes);
s = iter->status();
delete iter;
if (s.ok()) {
Log(options_.info_log,
"Generated table #%llu: %lld keys, %lld bytes",
(unsigned long long) output_number,
(unsigned long long) current_entries,
(unsigned long long) current_bytes);
}
}
return s;
}
Status DBImpl::InstallCompactionResults(CompactionState* compact) {
mutex_.AssertHeld();
Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
compact->compaction->num_input_files(0),
compact->compaction->level(),
compact->compaction->num_input_files(1),
compact->compaction->level() + 1,
static_cast<long long>(compact->total_bytes));
// Add compaction outputs
compact->compaction->AddInputDeletions(compact->compaction->edit());
const int level = compact->compaction->level();
for (size_t i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i];
compact->compaction->edit()->AddFile(
level + 1,
out.number, out.file_size, out.smallest, out.largest);
}
return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
}
Status DBImpl::DoCompactionWork(CompactionState* compact) {
const uint64_t start_micros = env_->NowMicros();
int64_t imm_micros = 0; // Micros spent doing imm_ compactions
Log(options_.info_log, "Compacting %d@%d + %d@%d files",
compact->compaction->num_input_files(0),
compact->compaction->level(),
compact->compaction->num_input_files(1),
compact->compaction->level() + 1);
assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
assert(compact->builder == NULL);
assert(compact->outfile == NULL);
if (snapshots_.empty()) {
compact->smallest_snapshot = versions_->LastSequence();
} else {
compact->smallest_snapshot = snapshots_.oldest()->number_;
}
// Release mutex while we're actually doing the compaction work
mutex_.Unlock();
Iterator* input = versions_->MakeInputIterator(compact->compaction);
input->SeekToFirst();
Status status;
ParsedInternalKey ikey;
std::string current_user_key;
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
// Prioritize immutable compaction work
if (has_imm_.NoBarrier_Load() != NULL) {
const uint64_t imm_start = env_->NowMicros();
mutex_.Lock();
if (imm_ != NULL) {
CompactMemTable();
bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
}
mutex_.Unlock();
imm_micros += (env_->NowMicros() - imm_start);
}
Slice key = input->key();
if (compact->compaction->ShouldStopBefore(key) &&
compact->builder != NULL) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
// Handle key/value, add to state, etc.
bool drop = false;
if (!ParseInternalKey(key, &ikey)) {
// Do not hide error keys
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
} else {
if (!has_current_user_key ||
user_comparator()->Compare(ikey.user_key,
Slice(current_user_key)) != 0) {
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
}
if (last_sequence_for_key <= compact->smallest_snapshot) {
// Hidden by an newer entry for same user key
drop = true; // (A)
} else if (ikey.type == kTypeDeletion &&
ikey.sequence <= compact->smallest_snapshot &&
compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
// smaller sequence numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;
}
last_sequence_for_key = ikey.sequence;
}
#if 0
Log(options_.info_log,
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
"%d smallest_snapshot: %d",
ikey.user_key.ToString().c_str(),
(int)ikey.sequence, ikey.type, kTypeValue, drop,
compact->compaction->IsBaseLevelForKey(ikey.user_key),
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
#endif
if (!drop) {
// Open output file if necessary
if (compact->builder == NULL) {
status = OpenCompactionOutputFile(compact);
if (!status.ok()) {
break;
}
}
if (compact->builder->NumEntries() == 0) {
compact->current_output()->smallest.DecodeFrom(key);
}
compact->current_output()->largest.DecodeFrom(key);
compact->builder->Add(key, input->value());
// Close output file if it is big enough
if (compact->builder->FileSize() >=
compact->compaction->MaxOutputFileSize()) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
}
input->Next();
}
if (status.ok() && shutting_down_.Acquire_Load()) {
status = Status::IOError("Deleting DB during compaction");
}
if (status.ok() && compact->builder != NULL) {
status = FinishCompactionOutputFile(compact, input);
}
if (status.ok()) {
status = input->status();
}
delete input;
input = NULL;
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros;
for (int which = 0; which < 2; which++) {
for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
stats.bytes_read += compact->compaction->input(which, i)->file_size;
}
}
for (size_t i = 0; i < compact->outputs.size(); i++) {
stats.bytes_written += compact->outputs[i].file_size;
}
mutex_.Lock();
stats_[compact->compaction->level() + 1].Add(stats);
if (status.ok()) {
status = InstallCompactionResults(compact);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log,
"compacted to: %s", versions_->LevelSummary(&tmp));
return status;
}
namespace {
struct IterState {
port::Mutex* mu;
Version* version;
MemTable* mem;
MemTable* imm;
};
static void CleanupIteratorState(void* arg1, void* arg2) {
IterState* state = reinterpret_cast<IterState*>(arg1);
state->mu->Lock();
state->mem->Unref();
if (state->imm != NULL) state->imm->Unref();
state->version->Unref();
state->mu->Unlock();
delete state;
}
} // namespace
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot) {
IterState* cleanup = new IterState;
mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
// Collect together all needed child iterators
std::vector<Iterator*> list;
list.push_back(mem_->NewIterator());
mem_->Ref();
if (imm_ != NULL) {
list.push_back(imm_->NewIterator());
imm_->Ref();
}
versions_->current()->AddIterators(options, &list);
Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size());
versions_->current()->Ref();
cleanup->mu = &mutex_;
cleanup->mem = mem_;
cleanup->imm = imm_;
cleanup->version = versions_->current();
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
mutex_.Unlock();
return internal_iter;
}
Iterator* DBImpl::TEST_NewInternalIterator() {
SequenceNumber ignored;
return NewInternalIterator(ReadOptions(), &ignored);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
MutexLock l(&mutex_);
return versions_->MaxNextLevelOverlappingBytes();
}
Status DBImpl::Get(const ReadOptions& options,
const Slice& key,
std::string* value) {
Status s;
MutexLock l(&mutex_);
SequenceNumber snapshot;
if (options.snapshot != NULL) {
snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
} else {
snapshot = versions_->LastSequence();
}
MemTable* mem = mem_;
MemTable* imm = imm_;
Version* current = versions_->current();
mem->Ref();
if (imm != NULL) imm->Ref();
current->Ref();
bool have_stat_update = false;
Version::GetStats stats;
// Unlock while reading from files and memtables
{
mutex_.Unlock();
// First look in the memtable, then in the immutable memtable (if any).
LookupKey lkey(key, snapshot);
if (mem->Get(lkey, value, &s)) {
// Done
} else if (imm != NULL && imm->Get(lkey, value, &s)) {
// Done
} else {
s = current->Get(options, lkey, value, &stats);
have_stat_update = true;
}
mutex_.Lock();
}
if (have_stat_update && current->UpdateStats(stats)) {
MaybeScheduleCompaction();
}
mem->Unref();
if (imm != NULL) imm->Unref();
current->Unref();
return s;
}
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot;
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
return NewDBIterator(
&dbname_, env_, user_comparator(), internal_iter,
(options.snapshot != NULL
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
}
const Snapshot* DBImpl::GetSnapshot() {
MutexLock l(&mutex_);
return snapshots_.New(versions_->LastSequence());
}
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
MutexLock l(&mutex_);
snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
}
// Convenience methods
Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
return DB::Put(o, key, val);
}
Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
return DB::Delete(options, key);
}
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
Writer w(&mutex_);
w.batch = my_batch;
w.sync = options.sync;
w.done = false;
MutexLock l(&mutex_);
writers_.push_back(&w);
while (!w.done && &w != writers_.front()) {
w.cv.Wait();
}
if (w.done) {
return w.status;
}
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(my_batch == NULL);
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && my_batch != NULL) { // NULL batch is for compactions
WriteBatch* updates = BuildBatchGroup(&last_writer);
WriteBatchInternal::SetSequence(updates, last_sequence + 1);
last_sequence += WriteBatchInternal::Count(updates);
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into mem_.
{
mutex_.Unlock();
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
if (status.ok() && options.sync) {
status = logfile_->Sync();
}
if (status.ok()) {
status = WriteBatchInternal::InsertInto(updates, mem_);
}
mutex_.Lock();
}
if (updates == tmp_batch_) tmp_batch_->Clear();
versions_->SetLastSequence(last_sequence);
}
while (true) {
Writer* ready = writers_.front();
writers_.pop_front();
if (ready != &w) {
ready->status = status;
ready->done = true;
ready->cv.Signal();
}
if (ready == last_writer) break;
}
// Notify new head of write queue
if (!writers_.empty()) {
writers_.front()->cv.Signal();
}
return status;
}
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-NULL batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
assert(!writers_.empty());
Writer* first = writers_.front();
WriteBatch* result = first->batch;
assert(result != NULL);
size_t size = WriteBatchInternal::ByteSize(first->batch);
// Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow
// down the small write too much.
size_t max_size = 1 << 20;
if (size <= (128<<10)) {
max_size = size + (128<<10);
}
*last_writer = first;
std::deque<Writer*>::iterator iter = writers_.begin();
++iter; // Advance past "first"
for (; iter != writers_.end(); ++iter) {
Writer* w = *iter;
if (w->sync && !first->sync) {
// Do not include a sync write into a batch handled by a non-sync write.
break;
}
if (w->batch != NULL) {
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}
// Append to *reuslt
if (result == first->batch) {
// Switch to temporary batch instead of disturbing caller's batch
result = tmp_batch_;
assert(WriteBatchInternal::Count(result) == 0);
WriteBatchInternal::Append(result, first->batch);
}
WriteBatchInternal::Append(result, w->batch);
}
*last_writer = w;
}
return result;
}
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
Status s;
while (true) {
if (!bg_error_.ok()) {
// Yield previous error
s = bg_error_;
break;
} else if (
allow_delay &&
versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
// individual write by 1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
mutex_.Unlock();
env_->SleepForMicroseconds(1000);
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
} else if (!force &&
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
// There is room in current memtable
break;
} else if (imm_ != NULL) {
// We have filled up the current memtable, but the previous
// one is still being compacted, so we wait.
bg_cv_.Wait();
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
// There are too many level-0 files.
Log(options_.info_log, "waiting...\n");
bg_cv_.Wait();
} else {
// Attempt to switch to a new memtable and trigger compaction of old
assert(versions_->PrevLogNumber() == 0);
uint64_t new_log_number = versions_->NewFileNumber();
WritableFile* lfile = NULL;
s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
if (!s.ok()) {
// Avoid chewing through file number space in a tight loop.
versions_->ReuseFileNumber(new_log_number);
break;
}
delete log_;
delete logfile_;
logfile_ = lfile;
logfile_number_ = new_log_number;
log_ = new log::Writer(lfile);
imm_ = mem_;
has_imm_.Release_Store(imm_);
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
force = false; // Do not force another compaction if have room
MaybeScheduleCompaction();
}
}
return s;
}
bool DBImpl::GetProperty(const Slice& property, std::string* value) {
value->clear();
MutexLock l(&mutex_);
Slice in = property;
Slice prefix("leveldb.");
if (!in.starts_with(prefix)) return false;
in.remove_prefix(prefix.size());
if (in.starts_with("num-files-at-level")) {
in.remove_prefix(strlen("num-files-at-level"));
uint64_t level;
bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
if (!ok || level >= config::kNumLevels) {
return false;
} else {
char buf[100];
snprintf(buf, sizeof(buf), "%d",
versions_->NumLevelFiles(static_cast<int>(level)));
*value = buf;
return true;
}
} else if (in == "stats") {
char buf[200];
snprintf(buf, sizeof(buf),
" Compactions\n"
"Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
"--------------------------------------------------\n"
);
value->append(buf);
for (int level = 0; level < config::kNumLevels; level++) {
int files = versions_->NumLevelFiles(level);
if (stats_[level].micros > 0 || files > 0) {
snprintf(
buf, sizeof(buf),
"%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
level,
files,
versions_->NumLevelBytes(level) / 1048576.0,
stats_[level].micros / 1e6,
stats_[level].bytes_read / 1048576.0,
stats_[level].bytes_written / 1048576.0);
value->append(buf);
}
}
return true;
} else if (in == "sstables") {
*value = versions_->current()->DebugString();
return true;
}
return false;
}
void DBImpl::GetApproximateSizes(
const Range* range, int n,
uint64_t* sizes) {
// TODO(opt): better implementation
Version* v;
{
MutexLock l(&mutex_);
versions_->current()->Ref();
v = versions_->current();
}
for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key.
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
uint64_t start = versions_->ApproximateOffsetOf(v, k1);
uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
sizes[i] = (limit >= start ? limit - start : 0);
}
{
MutexLock l(&mutex_);
v->Unref();
}
}
// Default implementations of convenience methods that subclasses of DB
// can call if they wish
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
WriteBatch batch;
batch.Put(key, value);
return Write(opt, &batch);
}
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
WriteBatch batch;
batch.Delete(key);
return Write(opt, &batch);
}
DB::~DB() { }
Status DB::Open(const Options& options, const std::string& dbname,
DB** dbptr) {
*dbptr = NULL;
DBImpl* impl = new DBImpl(options, dbname);
impl->mutex_.Lock();
VersionEdit edit;
Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
if (s.ok()) {
uint64_t new_log_number = impl->versions_->NewFileNumber();
WritableFile* lfile;
s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
&lfile);
if (s.ok()) {
edit.SetLogNumber(new_log_number);
impl->logfile_ = lfile;
impl->logfile_number_ = new_log_number;
impl->log_ = new log::Writer(lfile);
s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
}
if (s.ok()) {
impl->DeleteObsoleteFiles();
impl->MaybeScheduleCompaction();
}
}
impl->mutex_.Unlock();
if (s.ok()) {
*dbptr = impl;
} else {
delete impl;
}
return s;
}
Snapshot::~Snapshot() {
}
Status DestroyDB(const std::string& dbname, const Options& options) {
Env* env = options.env;
std::vector<std::string> filenames;
// Ignore error in case directory does not exist
env->GetChildren(dbname, &filenames);
if (filenames.empty()) {
return Status::OK();
}
FileLock* lock;
const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock);
if (result.ok()) {
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type != kDBLockFile) { // Lock file will be deleted at end
Status del = env->DeleteFile(dbname + "/" + filenames[i]);
if (result.ok() && !del.ok()) {
result = del;
}
}
}
env->UnlockFile(lock); // Ignore error since state is already gone
env->DeleteFile(lockname);
env->DeleteDir(dbname); // Ignore error in case dir contains other files
}
return result;
}
} // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
index 8d2bb3405..bd29dd805 100644
--- a/src/leveldb/db/db_impl.h
+++ b/src/leveldb/db/db_impl.h
@@ -1,194 +1,202 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
#define STORAGE_LEVELDB_DB_DB_IMPL_H_
#include <deque>
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "port/port.h"
+#include "port/thread_annotations.h"
namespace leveldb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class DBImpl : public DB {
public:
DBImpl(const Options& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin,*end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
// Force current memtable contents to be compacted.
Status TEST_CompactMemTable();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes();
private:
friend class DB;
struct CompactionState;
struct Writer;
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot);
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
- Status Recover(VersionEdit* edit);
+ Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
- Status CompactMemTable();
+ Status CompactMemTable()
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
- SequenceNumber* max_sequence);
+ SequenceNumber* max_sequence)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base);
+ Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- Status MakeRoomForWrite(bool force /* compact even if there is room? */);
+ Status MakeRoomForWrite(bool force /* compact even if there is room? */)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
WriteBatch* BuildBatchGroup(Writer** last_writer);
- void MaybeScheduleCompaction();
+ void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
static void BGWork(void* db);
void BackgroundCall();
- Status BackgroundCompaction();
- void CleanupCompaction(CompactionState* compact);
- Status DoCompactionWork(CompactionState* compact);
+ Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ void CleanupCompaction(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ Status DoCompactionWork(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
- Status InstallCompactionResults(CompactionState* compact);
+ Status InstallCompactionResults(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Constant after construction
Env* const env_;
const InternalKeyComparator internal_comparator_;
const InternalFilterPolicy internal_filter_policy_;
const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_;
bool owns_cache_;
const std::string dbname_;
// table_cache_ provides its own synchronization
TableCache* table_cache_;
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes
MemTable* mem_;
MemTable* imm_; // Memtable being compacted
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
WritableFile* logfile_;
uint64_t logfile_number_;
log::Writer* log_;
// Queue of writers.
std::deque<Writer*> writers_;
WriteBatch* tmp_batch_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// Has a background compaction been scheduled or is running?
bool bg_compaction_scheduled_;
// Information for a manual compaction
struct ManualCompaction {
int level;
bool done;
const InternalKey* begin; // NULL means beginning of key range
const InternalKey* end; // NULL means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress
};
ManualCompaction* manual_compaction_;
VersionSet* versions_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
// Per level compaction stats. stats_[level] stores the stats for
// compactions that produced data for the specified "level".
struct CompactionStats {
int64_t micros;
int64_t bytes_read;
int64_t bytes_written;
CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
void Add(const CompactionStats& c) {
this->micros += c.micros;
this->bytes_read += c.bytes_read;
this->bytes_written += c.bytes_written;
}
};
CompactionStats stats_[config::kNumLevels];
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src);
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
index 3744d0e11..74abd13e5 100644
--- a/src/leveldb/db/db_test.cc
+++ b/src/leveldb/db/db_test.cc
@@ -1,1946 +1,1952 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "leveldb/cache.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
namespace {
class AtomicCounter {
private:
port::Mutex mu_;
int count_;
public:
AtomicCounter() : count_(0) { }
void Increment() {
MutexLock l(&mu_);
count_++;
}
int Read() {
MutexLock l(&mu_);
return count_;
}
void Reset() {
MutexLock l(&mu_);
count_ = 0;
}
};
}
// Special Env used to delay background operations
class SpecialEnv : public EnvWrapper {
public:
// sstable Sync() calls are blocked while this pointer is non-NULL.
port::AtomicPointer delay_sstable_sync_;
// Simulate no-space errors while this pointer is non-NULL.
port::AtomicPointer no_space_;
// Simulate non-writable file system while this pointer is non-NULL
port::AtomicPointer non_writable_;
bool count_random_reads_;
AtomicCounter random_read_counter_;
AtomicCounter sleep_counter_;
explicit SpecialEnv(Env* base) : EnvWrapper(base) {
delay_sstable_sync_.Release_Store(NULL);
no_space_.Release_Store(NULL);
non_writable_.Release_Store(NULL);
count_random_reads_ = false;
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
class SSTableFile : public WritableFile {
private:
SpecialEnv* env_;
WritableFile* base_;
public:
SSTableFile(SpecialEnv* env, WritableFile* base)
: env_(env),
base_(base) {
}
~SSTableFile() { delete base_; }
Status Append(const Slice& data) {
if (env_->no_space_.Acquire_Load() != NULL) {
// Drop writes on the floor
return Status::OK();
} else {
return base_->Append(data);
}
}
Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); }
Status Sync() {
while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
env_->SleepForMicroseconds(100000);
}
return base_->Sync();
}
};
if (non_writable_.Acquire_Load() != NULL) {
return Status::IOError("simulated write error");
}
Status s = target()->NewWritableFile(f, r);
if (s.ok()) {
if (strstr(f.c_str(), ".sst") != NULL) {
*r = new SSTableFile(this, *r);
}
}
return s;
}
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
class CountingFile : public RandomAccessFile {
private:
RandomAccessFile* target_;
AtomicCounter* counter_;
public:
CountingFile(RandomAccessFile* target, AtomicCounter* counter)
: target_(target), counter_(counter) {
}
virtual ~CountingFile() { delete target_; }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
counter_->Increment();
return target_->Read(offset, n, result, scratch);
}
};
Status s = target()->NewRandomAccessFile(f, r);
if (s.ok() && count_random_reads_) {
*r = new CountingFile(*r, &random_read_counter_);
}
return s;
}
virtual void SleepForMicroseconds(int micros) {
sleep_counter_.Increment();
target()->SleepForMicroseconds(micros);
}
};
class DBTest {
private:
const FilterPolicy* filter_policy_;
// Sequence of option configurations to try
enum OptionConfig {
kDefault,
kFilter,
kUncompressed,
kEnd
};
int option_config_;
public:
std::string dbname_;
SpecialEnv* env_;
DB* db_;
Options last_options_;
DBTest() : option_config_(kDefault),
env_(new SpecialEnv(Env::Default())) {
filter_policy_ = NewBloomFilterPolicy(10);
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options());
db_ = NULL;
Reopen();
}
~DBTest() {
delete db_;
DestroyDB(dbname_, Options());
delete env_;
delete filter_policy_;
}
// Switch to a fresh database with the next option configuration to
// test. Return false if there are no more configurations to test.
bool ChangeOptions() {
option_config_++;
if (option_config_ >= kEnd) {
return false;
} else {
DestroyAndReopen();
return true;
}
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
switch (option_config_) {
case kFilter:
options.filter_policy = filter_policy_;
break;
case kUncompressed:
options.compression = kNoCompression;
break;
default:
break;
}
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = NULL;
}
void DestroyAndReopen(Options* options = NULL) {
delete db_;
db_ = NULL;
DestroyDB(dbname_, Options());
ASSERT_OK(TryReopen(options));
}
Status TryReopen(Options* options) {
delete db_;
db_ = NULL;
Options opts;
if (options != NULL) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const std::string& k, const std::string& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
// Return a string that contains all key,value pairs in order,
// formatted like "(k1->v1)(k2->v2)".
std::string Contents() {
std::vector<std::string> forward;
std::string result;
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
std::string s = IterStatus(iter);
result.push_back('(');
result.append(s);
result.push_back(')');
forward.push_back(s);
}
// Check reverse iteration results are the reverse of forward results
int matched = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_LT(matched, forward.size());
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
matched++;
}
ASSERT_EQ(matched, forward.size());
delete iter;
return result;
}
std::string AllEntriesFor(const Slice& user_key) {
Iterator* iter = dbfull()->TEST_NewInternalIterator();
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
iter->Seek(target.Encode());
std::string result;
if (!iter->status().ok()) {
result = iter->status().ToString();
} else {
result = "[ ";
bool first = true;
while (iter->Valid()) {
ParsedInternalKey ikey;
if (!ParseInternalKey(iter->key(), &ikey)) {
result += "CORRUPTED";
} else {
if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
break;
}
if (!first) {
result += ", ";
}
first = false;
switch (ikey.type) {
case kTypeValue:
result += iter->value().ToString();
break;
case kTypeDeletion:
result += "DEL";
break;
}
}
iter->Next();
}
if (!first) {
result += " ";
}
result += "]";
}
delete iter;
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
int TotalTableFiles() {
int result = 0;
for (int level = 0; level < config::kNumLevels; level++) {
result += NumTableFilesAtLevel(level);
}
return result;
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < config::kNumLevels; level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
int CountFiles() {
std::vector<std::string> files;
env_->GetChildren(dbname_, &files);
return static_cast<int>(files.size());
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
void Compact(const Slice& start, const Slice& limit) {
db_->CompactRange(&start, &limit);
}
// Do n memtable compactions, each of which produces an sstable
// covering the range [small,large].
void MakeTables(int n, const std::string& small, const std::string& large) {
for (int i = 0; i < n; i++) {
Put(small, "begin");
Put(large, "end");
dbfull()->TEST_CompactMemTable();
}
}
// Prevent pushing of new sstables into deeper levels by adding
// tables that cover a specified range to all levels.
void FillLevels(const std::string& smallest, const std::string& largest) {
MakeTables(config::kNumLevels, smallest, largest);
}
void DumpFileCounts(const char* label) {
fprintf(stderr, "---\n%s:\n", label);
fprintf(stderr, "maxoverlap: %lld\n",
static_cast<long long>(
dbfull()->TEST_MaxNextLevelOverlappingBytes()));
for (int level = 0; level < config::kNumLevels; level++) {
int num = NumTableFilesAtLevel(level);
if (num > 0) {
fprintf(stderr, " level %3d : %d files\n", level, num);
}
}
}
std::string DumpSSTableList() {
std::string property;
db_->GetProperty("leveldb.sstables", &property);
return property;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
TEST(DBTest, Empty) {
do {
ASSERT_TRUE(db_ != NULL);
ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, ReadWrite) {
do {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
} while (ChangeOptions());
}
TEST(DBTest, PutDeleteGet) {
do {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetFromImmutableLayer) {
do {
Options options = CurrentOptions();
options.env = env_;
options.write_buffer_size = 100000; // Small write buffer
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
Put("k1", std::string(100000, 'x')); // Fill memtable
Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls
} while (ChangeOptions());
}
TEST(DBTest, GetFromVersions) {
do {
ASSERT_OK(Put("foo", "v1"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v1", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetSnapshot) {
do {
// Try with both a short key and a long key
for (int i = 0; i < 2; i++) {
std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
ASSERT_OK(Put(key, "v1"));
const Snapshot* s1 = db_->GetSnapshot();
ASSERT_OK(Put(key, "v2"));
ASSERT_EQ("v2", Get(key));
ASSERT_EQ("v1", Get(key, s1));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v2", Get(key));
ASSERT_EQ("v1", Get(key, s1));
db_->ReleaseSnapshot(s1);
}
} while (ChangeOptions());
}
TEST(DBTest, GetLevel0Ordering) {
do {
// Check that we process level-0 files in correct order. The code
// below generates two level-0 files where the earlier one comes
// before the later one in the level-0 file list since the earlier
// one has a smaller "smallest" key.
ASSERT_OK(Put("bar", "b"));
ASSERT_OK(Put("foo", "v1"));
dbfull()->TEST_CompactMemTable();
ASSERT_OK(Put("foo", "v2"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetOrderedByLevels) {
do {
ASSERT_OK(Put("foo", "v1"));
Compact("a", "z");
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, GetPicksCorrectFile) {
do {
// Arrange to have multiple files in a non-level-0 level.
ASSERT_OK(Put("a", "va"));
Compact("a", "b");
ASSERT_OK(Put("x", "vx"));
Compact("x", "y");
ASSERT_OK(Put("f", "vf"));
Compact("f", "g");
ASSERT_EQ("va", Get("a"));
ASSERT_EQ("vf", Get("f"));
ASSERT_EQ("vx", Get("x"));
} while (ChangeOptions());
}
TEST(DBTest, GetEncountersEmptyLevel) {
do {
// Arrange for the following to happen:
// * sstable A in level 0
// * nothing in level 1
// * sstable B in level 2
// Then do enough Get() calls to arrange for an automatic compaction
// of sstable A. A bug would cause the compaction to be marked as
// occuring at level 1 (instead of the correct level 0).
// Step 1: First place sstables in levels 0 and 2
int compaction_count = 0;
while (NumTableFilesAtLevel(0) == 0 ||
NumTableFilesAtLevel(2) == 0) {
ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
compaction_count++;
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
}
// Step 2: clear level 1 if necessary.
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 1);
// Step 3: read a bunch of times
for (int i = 0; i < 1000; i++) {
ASSERT_EQ("NOT_FOUND", Get("missing"));
}
// Step 4: Wait for compaction to finish
env_->SleepForMicroseconds(1000000);
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
} while (ChangeOptions());
}
TEST(DBTest, IterEmpty) {
Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->Seek("foo");
ASSERT_EQ(IterStatus(iter), "(invalid)");
delete iter;
}
TEST(DBTest, IterSingle) {
ASSERT_OK(Put("a", "va"));
Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->Seek("");
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->Seek("a");
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->Seek("b");
ASSERT_EQ(IterStatus(iter), "(invalid)");
delete iter;
}
TEST(DBTest, IterMulti) {
ASSERT_OK(Put("a", "va"));
ASSERT_OK(Put("b", "vb"));
ASSERT_OK(Put("c", "vc"));
Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Next();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->Seek("");
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Seek("a");
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Seek("ax");
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Seek("b");
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Seek("z");
ASSERT_EQ(IterStatus(iter), "(invalid)");
// Switch from reverse to forward
iter->SeekToLast();
iter->Prev();
iter->Prev();
iter->Next();
ASSERT_EQ(IterStatus(iter), "b->vb");
// Switch from forward to reverse
iter->SeekToFirst();
iter->Next();
iter->Next();
iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->vb");
// Make sure iter stays at snapshot
ASSERT_OK(Put("a", "va2"));
ASSERT_OK(Put("a2", "va3"));
ASSERT_OK(Put("b", "vb2"));
ASSERT_OK(Put("c", "vc2"));
ASSERT_OK(Delete("b"));
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Next();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
delete iter;
}
TEST(DBTest, IterSmallAndLargeMix) {
ASSERT_OK(Put("a", "va"));
ASSERT_OK(Put("b", std::string(100000, 'b')));
ASSERT_OK(Put("c", "vc"));
ASSERT_OK(Put("d", std::string(100000, 'd')));
ASSERT_OK(Put("e", std::string(100000, 'e')));
Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Next();
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
iter->Next();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Next();
ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
iter->Next();
ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
iter->Next();
ASSERT_EQ(IterStatus(iter), "(invalid)");
iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
iter->Prev();
ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
iter->Prev();
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
iter->Prev();
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "(invalid)");
delete iter;
}
TEST(DBTest, IterMultiWithDelete) {
do {
ASSERT_OK(Put("a", "va"));
ASSERT_OK(Put("b", "vb"));
ASSERT_OK(Put("c", "vc"));
ASSERT_OK(Delete("b"));
ASSERT_EQ("NOT_FOUND", Get("b"));
Iterator* iter = db_->NewIterator(ReadOptions());
iter->Seek("c");
ASSERT_EQ(IterStatus(iter), "c->vc");
iter->Prev();
ASSERT_EQ(IterStatus(iter), "a->va");
delete iter;
} while (ChangeOptions());
}
TEST(DBTest, Recover) {
do {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("baz", "v5"));
Reopen();
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v5", Get("baz"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
ASSERT_OK(Put("foo", "v4"));
ASSERT_EQ("v4", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ("v5", Get("baz"));
} while (ChangeOptions());
}
TEST(DBTest, RecoveryWithEmptyLog) {
do {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("foo", "v2"));
Reopen();
Reopen();
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
} while (ChangeOptions());
}
// Check that writes done during a memtable compaction are recovered
// if the database is shutdown during the memtable compaction.
TEST(DBTest, RecoverDuringMemtableCompaction) {
do {
Options options = CurrentOptions();
options.env = env_;
options.write_buffer_size = 1000000;
Reopen(&options);
// Trigger a long memtable compaction and reopen the database during it
ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file
ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable
ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction
ASSERT_OK(Put("bar", "v2")); // Goes to new log file
Reopen(&options);
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
} while (ChangeOptions());
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
TEST(DBTest, MinorCompactionsHappen) {
Options options = CurrentOptions();
options.write_buffer_size = 10000;
Reopen(&options);
const int N = 500;
int starting_num_tables = TotalTableFiles();
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
}
int ending_num_tables = TotalTableFiles();
ASSERT_GT(ending_num_tables, starting_num_tables);
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
Reopen();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
}
TEST(DBTest, RecoverWithLargeLog) {
{
Options options = CurrentOptions();
Reopen(&options);
ASSERT_OK(Put("big1", std::string(200000, '1')));
ASSERT_OK(Put("big2", std::string(200000, '2')));
ASSERT_OK(Put("small3", std::string(10, '3')));
ASSERT_OK(Put("small4", std::string(10, '4')));
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
}
// Make sure that if we re-open with a small write buffer size that
// we flush table files in the middle of a large log file.
Options options = CurrentOptions();
options.write_buffer_size = 100000;
Reopen(&options);
ASSERT_EQ(NumTableFilesAtLevel(0), 3);
ASSERT_EQ(std::string(200000, '1'), Get("big1"));
ASSERT_EQ(std::string(200000, '2'), Get("big2"));
ASSERT_EQ(std::string(10, '3'), Get("small3"));
ASSERT_EQ(std::string(10, '4'), Get("small4"));
ASSERT_GT(NumTableFilesAtLevel(0), 1);
}
TEST(DBTest, CompactionsGenerateMultipleFiles) {
Options options = CurrentOptions();
options.write_buffer_size = 100000000; // Large write buffer
Reopen(&options);
Random rnd(301);
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
std::vector<std::string> values;
for (int i = 0; i < 80; i++) {
values.push_back(RandomString(&rnd, 100000));
ASSERT_OK(Put(Key(i), values[i]));
}
// Reopening moves updates to level-0
Reopen(&options);
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1);
for (int i = 0; i < 80; i++) {
ASSERT_EQ(Get(Key(i)), values[i]);
}
}
TEST(DBTest, RepeatedWritesToSameKey) {
Options options = CurrentOptions();
options.env = env_;
options.write_buffer_size = 100000; // Small write buffer
Reopen(&options);
// We must have at most one file per level except for level-0,
// which may have up to kL0_StopWritesTrigger files.
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
Random rnd(301);
std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
for (int i = 0; i < 5 * kMaxFiles; i++) {
Put("key", value);
ASSERT_LE(TotalTableFiles(), kMaxFiles);
fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles());
}
}
TEST(DBTest, SparseMerge) {
Options options = CurrentOptions();
options.compression = kNoCompression;
Reopen(&options);
FillLevels("A", "Z");
// Suppose there is:
// small amount of data with prefix A
// large amount of data with prefix B
// small amount of data with prefix C
// and that recent updates have made small changes to all three prefixes.
// Check that we do not do a compaction that merges all of B in one shot.
const std::string value(1000, 'x');
Put("A", "va");
// Write approximately 100MB of "B" values
for (int i = 0; i < 100000; i++) {
char key[100];
snprintf(key, sizeof(key), "B%010d", i);
Put(key, value);
}
Put("C", "vc");
dbfull()->TEST_CompactMemTable();
dbfull()->TEST_CompactRange(0, NULL, NULL);
// Make sparse update
Put("A", "va2");
Put("B100", "bvalue2");
Put("C", "vc2");
dbfull()->TEST_CompactMemTable();
// Compactions should not cause us to create a situation where
// a file overlaps too much data at the next level.
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
}
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
bool result = (val >= low) && (val <= high);
if (!result) {
fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
(unsigned long long)(val),
(unsigned long long)(low),
(unsigned long long)(high));
}
return result;
}
TEST(DBTest, ApproximateSizes) {
do {
Options options = CurrentOptions();
options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression;
DestroyAndReopen();
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
Reopen(&options);
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
const int N = 80;
static const int S1 = 100000;
static const int S2 = 105000; // Allow some expansion from metadata
Random rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
}
// 0 because GetApproximateSizes() does not account for memtable space
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
for (int compact_start = 0; compact_start < N; compact_start += 10) {
for (int i = 0; i < N; i += 10) {
ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));
ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));
ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
}
ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));
std::string cstart_str = Key(compact_start);
std::string cend_str = Key(compact_start + 9);
Slice cstart = cstart_str;
Slice cend = cend_str;
dbfull()->TEST_CompactRange(0, &cstart, &cend);
}
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 0);
}
} while (ChangeOptions());
}
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
do {
Options options = CurrentOptions();
options.compression = kNoCompression;
Reopen();
Random rnd(301);
std::string big1 = RandomString(&rnd, 100000);
ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(2), big1));
ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(4), big1));
ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
dbfull()->TEST_CompactRange(0, NULL, NULL);
}
} while (ChangeOptions());
}
TEST(DBTest, IteratorPinsRef) {
Put("foo", "hello");
// Get iterator that will yield the current contents of the DB.
Iterator* iter = db_->NewIterator(ReadOptions());
// Write to force compactions
Put("foo", "newvalue1");
for (int i = 0; i < 100; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
}
Put("foo", "newvalue2");
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("foo", iter->key().ToString());
ASSERT_EQ("hello", iter->value().ToString());
iter->Next();
ASSERT_TRUE(!iter->Valid());
delete iter;
}
TEST(DBTest, Snapshot) {
do {
Put("foo", "v1");
const Snapshot* s1 = db_->GetSnapshot();
Put("foo", "v2");
const Snapshot* s2 = db_->GetSnapshot();
Put("foo", "v3");
const Snapshot* s3 = db_->GetSnapshot();
Put("foo", "v4");
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v3", Get("foo", s3));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s3);
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s1);
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s2);
ASSERT_EQ("v4", Get("foo"));
} while (ChangeOptions());
}
TEST(DBTest, HiddenValuesAreRemoved) {
do {
Random rnd(301);
FillLevels("a", "z");
std::string big = RandomString(&rnd, 50000);
Put("foo", big);
Put("pastfoo", "v");
const Snapshot* snapshot = db_->GetSnapshot();
Put("foo", "tiny");
Put("pastfoo2", "v2"); // Advance sequence number one more
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_GT(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(big, Get("foo", snapshot));
ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
Slice x("x");
dbfull()->TEST_CompactRange(0, NULL, &x);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GE(NumTableFilesAtLevel(1), 1);
dbfull()->TEST_CompactRange(1, NULL, &x);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
} while (ChangeOptions());
}
TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
// Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
Delete("foo");
Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
Slice z("z");
dbfull()->TEST_CompactRange(last-2, NULL, &z);
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
}
TEST(DBTest, DeletionMarkers2) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
// Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-2, NULL, NULL);
// DEL kept: "last" file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}
TEST(DBTest, OverlapInLevel0) {
do {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
ASSERT_OK(Put("100", "v100"));
ASSERT_OK(Put("999", "v999"));
dbfull()->TEST_CompactMemTable();
ASSERT_OK(Delete("100"));
ASSERT_OK(Delete("999"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("0,1,1", FilesPerLevel());
// Make files spanning the following ranges in level-0:
// files[0] 200 .. 900
// files[1] 300 .. 500
// Note that files are sorted by smallest key.
ASSERT_OK(Put("300", "v300"));
ASSERT_OK(Put("500", "v500"));
dbfull()->TEST_CompactMemTable();
ASSERT_OK(Put("200", "v200"));
ASSERT_OK(Put("600", "v600"));
ASSERT_OK(Put("900", "v900"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("2,1,1", FilesPerLevel());
// Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, NULL, NULL);
dbfull()->TEST_CompactRange(2, NULL, NULL);
ASSERT_EQ("2", FilesPerLevel());
// Do a memtable compaction. Before bug-fix, the compaction would
// not detect the overlap with level-0 files and would incorrectly place
// the deletion in a deeper level.
ASSERT_OK(Delete("600"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("NOT_FOUND", Get("600"));
} while (ChangeOptions());
}
TEST(DBTest, L0_CompactionBug_Issue44_a) {
Reopen();
ASSERT_OK(Put("b", "v"));
Reopen();
ASSERT_OK(Delete("b"));
ASSERT_OK(Delete("a"));
Reopen();
ASSERT_OK(Delete("a"));
Reopen();
ASSERT_OK(Put("a", "v"));
Reopen();
Reopen();
ASSERT_EQ("(a->v)", Contents());
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
ASSERT_EQ("(a->v)", Contents());
}
TEST(DBTest, L0_CompactionBug_Issue44_b) {
Reopen();
Put("","");
Reopen();
Delete("e");
Put("","");
Reopen();
Put("c", "cv");
Reopen();
Put("","");
Reopen();
Put("","");
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
Reopen();
Put("d","dv");
Reopen();
Put("","");
Reopen();
Delete("d");
Delete("b");
Reopen();
ASSERT_EQ("(->)(c->cv)", Contents());
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
ASSERT_EQ("(->)(c->cv)", Contents());
}
TEST(DBTest, ComparatorCheck) {
class NewComparator : public Comparator {
public:
virtual const char* Name() const { return "leveldb.NewComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
return BytewiseComparator()->Compare(a, b);
}
virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
BytewiseComparator()->FindShortestSeparator(s, l);
}
virtual void FindShortSuccessor(std::string* key) const {
BytewiseComparator()->FindShortSuccessor(key);
}
};
NewComparator cmp;
Options new_options = CurrentOptions();
new_options.comparator = &cmp;
Status s = TryReopen(&new_options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
<< s.ToString();
}
TEST(DBTest, CustomComparator) {
class NumberComparator : public Comparator {
public:
virtual const char* Name() const { return "test.NumberComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
return ToNumber(a) - ToNumber(b);
}
virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
ToNumber(*s); // Check format
ToNumber(l); // Check format
}
virtual void FindShortSuccessor(std::string* key) const {
ToNumber(*key); // Check format
}
private:
static int ToNumber(const Slice& x) {
// Check that there are no extra characters.
ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
<< EscapeString(x);
int val;
char ignored;
ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
<< EscapeString(x);
return val;
}
};
NumberComparator cmp;
Options new_options = CurrentOptions();
new_options.create_if_missing = true;
new_options.comparator = &cmp;
new_options.filter_policy = NULL; // Cannot use bloom filters
new_options.write_buffer_size = 1000; // Compact more often
DestroyAndReopen(&new_options);
ASSERT_OK(Put("[10]", "ten"));
ASSERT_OK(Put("[0x14]", "twenty"));
for (int i = 0; i < 2; i++) {
ASSERT_EQ("ten", Get("[10]"));
ASSERT_EQ("ten", Get("[0xa]"));
ASSERT_EQ("twenty", Get("[20]"));
ASSERT_EQ("twenty", Get("[0x14]"));
ASSERT_EQ("NOT_FOUND", Get("[15]"));
ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
Compact("[0]", "[9999]");
}
for (int run = 0; run < 2; run++) {
for (int i = 0; i < 1000; i++) {
char buf[100];
snprintf(buf, sizeof(buf), "[%d]", i*10);
ASSERT_OK(Put(buf, buf));
}
Compact("[0]", "[1000000]");
}
}
TEST(DBTest, ManualCompaction) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2)
<< "Need to update this test to match kMaxMemCompactLevel";
MakeTables(3, "p", "q");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls before files
Compact("", "c");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls after files
Compact("r", "z");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range overlaps files
Compact("p1", "p9");
ASSERT_EQ("0,0,1", FilesPerLevel());
// Populate a different range
MakeTables(3, "c", "e");
ASSERT_EQ("1,1,2", FilesPerLevel());
// Compact just the new range
Compact("b", "f");
ASSERT_EQ("0,0,2", FilesPerLevel());
// Compact all
MakeTables(1, "a", "z");
ASSERT_EQ("0,1,2", FilesPerLevel());
db_->CompactRange(NULL, NULL);
ASSERT_EQ("0,0,1", FilesPerLevel());
}
TEST(DBTest, DBOpen_Options) {
std::string dbname = test::TmpDir() + "/db_options_test";
DestroyDB(dbname, Options());
// Does not exist, and create_if_missing == false: error
DB* db = NULL;
Options opts;
opts.create_if_missing = false;
Status s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
ASSERT_TRUE(db == NULL);
// Does not exist, and create_if_missing == true: OK
opts.create_if_missing = true;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
// Does exist, and error_if_exists == true: error
opts.create_if_missing = false;
opts.error_if_exists = true;
s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
ASSERT_TRUE(db == NULL);
// Does exist, and error_if_exists == false: OK
opts.create_if_missing = true;
opts.error_if_exists = false;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
}
+TEST(DBTest, Locking) {
+ DB* db2 = NULL;
+ Status s = DB::Open(CurrentOptions(), dbname_, &db2);
+ ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
+}
+
// Check that number of files does not grow when we are out of space
TEST(DBTest, NoSpace) {
Options options = CurrentOptions();
options.env = env_;
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
Compact("a", "z");
const int num_files = CountFiles();
env_->no_space_.Release_Store(env_); // Force out-of-space errors
env_->sleep_counter_.Reset();
for (int i = 0; i < 5; i++) {
for (int level = 0; level < config::kNumLevels-1; level++) {
dbfull()->TEST_CompactRange(level, NULL, NULL);
}
}
env_->no_space_.Release_Store(NULL);
ASSERT_LT(CountFiles(), num_files + 3);
// Check that compaction attempts slept after errors
ASSERT_GE(env_->sleep_counter_.Read(), 5);
}
TEST(DBTest, NonWritableFileSystem) {
Options options = CurrentOptions();
options.write_buffer_size = 1000;
options.env = env_;
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
env_->non_writable_.Release_Store(env_); // Force errors for new files
std::string big(100000, 'x');
int errors = 0;
for (int i = 0; i < 20; i++) {
fprintf(stderr, "iter %d; errors %d\n", i, errors);
if (!Put("foo", big).ok()) {
errors++;
env_->SleepForMicroseconds(100000);
}
}
ASSERT_GT(errors, 0);
env_->non_writable_.Release_Store(NULL);
}
TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");
const int num_files = CountFiles();
for (int i = 0; i < 10; i++) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");
}
ASSERT_EQ(CountFiles(), num_files);
}
TEST(DBTest, BloomFilter) {
env_->count_random_reads_ = true;
Options options = CurrentOptions();
options.env = env_;
options.block_cache = NewLRUCache(0); // Prevent cache hits
options.filter_policy = NewBloomFilterPolicy(10);
Reopen(&options);
// Populate multiple layers
const int N = 10000;
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), Key(i)));
}
Compact("a", "z");
for (int i = 0; i < N; i += 100) {
ASSERT_OK(Put(Key(i), Key(i)));
}
dbfull()->TEST_CompactMemTable();
// Prevent auto compactions triggered by seeks
env_->delay_sstable_sync_.Release_Store(env_);
// Lookup present keys. Should rarely read from small sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i), Get(Key(i)));
}
int reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d present => %d reads\n", N, reads);
ASSERT_GE(reads, N);
ASSERT_LE(reads, N + 2*N/100);
// Lookup present keys. Should rarely read from either sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
}
reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d missing => %d reads\n", N, reads);
ASSERT_LE(reads, 3*N/100);
env_->delay_sstable_sync_.Release_Store(NULL);
Close();
delete options.block_cache;
delete options.filter_policy;
}
// Multi-threaded test:
namespace {
static const int kNumThreads = 4;
static const int kTestSeconds = 10;
static const int kNumKeys = 1000;
struct MTState {
DBTest* test;
port::AtomicPointer stop;
port::AtomicPointer counter[kNumThreads];
port::AtomicPointer thread_done[kNumThreads];
};
struct MTThread {
MTState* state;
int id;
};
static void MTThreadBody(void* arg) {
MTThread* t = reinterpret_cast<MTThread*>(arg);
int id = t->id;
DB* db = t->state->test->db_;
uintptr_t counter = 0;
fprintf(stderr, "... starting thread %d\n", id);
Random rnd(1000 + id);
std::string value;
char valbuf[1500];
while (t->state->stop.Acquire_Load() == NULL) {
t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
int key = rnd.Uniform(kNumKeys);
char keybuf[20];
snprintf(keybuf, sizeof(keybuf), "%016d", key);
if (rnd.OneIn(2)) {
// Write values of the form <key, my id, counter>.
// We add some padding for force compactions.
snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d",
key, id, static_cast<int>(counter));
ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf)));
} else {
// Read a value and verify that it matches the pattern written above.
Status s = db->Get(ReadOptions(), Slice(keybuf), &value);
if (s.IsNotFound()) {
// Key has not yet been written
} else {
// Check that the writer thread counter is >= the counter in the value
ASSERT_OK(s);
int k, w, c;
ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value;
ASSERT_EQ(k, key);
ASSERT_GE(w, 0);
ASSERT_LT(w, kNumThreads);
ASSERT_LE(c, reinterpret_cast<uintptr_t>(
t->state->counter[w].Acquire_Load()));
}
}
counter++;
}
t->state->thread_done[id].Release_Store(t);
fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
}
} // namespace
TEST(DBTest, MultiThreaded) {
do {
// Initialize state
MTState mt;
mt.test = this;
mt.stop.Release_Store(0);
for (int id = 0; id < kNumThreads; id++) {
mt.counter[id].Release_Store(0);
mt.thread_done[id].Release_Store(0);
}
// Start threads
MTThread thread[kNumThreads];
for (int id = 0; id < kNumThreads; id++) {
thread[id].state = &mt;
thread[id].id = id;
env_->StartThread(MTThreadBody, &thread[id]);
}
// Let them run for a while
env_->SleepForMicroseconds(kTestSeconds * 1000000);
// Stop the threads and wait for them to finish
mt.stop.Release_Store(&mt);
for (int id = 0; id < kNumThreads; id++) {
while (mt.thread_done[id].Acquire_Load() == NULL) {
env_->SleepForMicroseconds(100000);
}
}
} while (ChangeOptions());
}
namespace {
typedef std::map<std::string, std::string> KVMap;
}
class ModelDB: public DB {
public:
class ModelSnapshot : public Snapshot {
public:
KVMap map_;
};
explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
return DB::Put(o, k, v);
}
virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) {
assert(false); // Not implemented
return Status::NotFound(key);
}
virtual Iterator* NewIterator(const ReadOptions& options) {
if (options.snapshot == NULL) {
KVMap* saved = new KVMap;
*saved = map_;
return new ModelIter(saved, true);
} else {
const KVMap* snapshot_state =
&(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
return new ModelIter(snapshot_state, false);
}
}
virtual const Snapshot* GetSnapshot() {
ModelSnapshot* snapshot = new ModelSnapshot;
snapshot->map_ = map_;
return snapshot;
}
virtual void ReleaseSnapshot(const Snapshot* snapshot) {
delete reinterpret_cast<const ModelSnapshot*>(snapshot);
}
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
class Handler : public WriteBatch::Handler {
public:
KVMap* map_;
virtual void Put(const Slice& key, const Slice& value) {
(*map_)[key.ToString()] = value.ToString();
}
virtual void Delete(const Slice& key) {
map_->erase(key.ToString());
}
};
Handler handler;
handler.map_ = &map_;
return batch->Iterate(&handler);
}
virtual bool GetProperty(const Slice& property, std::string* value) {
return false;
}
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
for (int i = 0; i < n; i++) {
sizes[i] = 0;
}
}
virtual void CompactRange(const Slice* start, const Slice* end) {
}
private:
class ModelIter: public Iterator {
public:
ModelIter(const KVMap* map, bool owned)
: map_(map), owned_(owned), iter_(map_->end()) {
}
~ModelIter() {
if (owned_) delete map_;
}
virtual bool Valid() const { return iter_ != map_->end(); }
virtual void SeekToFirst() { iter_ = map_->begin(); }
virtual void SeekToLast() {
if (map_->empty()) {
iter_ = map_->end();
} else {
iter_ = map_->find(map_->rbegin()->first);
}
}
virtual void Seek(const Slice& k) {
iter_ = map_->lower_bound(k.ToString());
}
virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;
const bool owned_; // Do we own map_
KVMap::const_iterator iter_;
};
const Options options_;
KVMap map_;
};
static std::string RandomKey(Random* rnd) {
int len = (rnd->OneIn(3)
? 1 // Short sometimes to encourage collisions
: (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
return test::RandomKey(rnd, len);
}
static bool CompareIterators(int step,
DB* model,
DB* db,
const Snapshot* model_snap,
const Snapshot* db_snap) {
ReadOptions options;
options.snapshot = model_snap;
Iterator* miter = model->NewIterator(options);
options.snapshot = db_snap;
Iterator* dbiter = db->NewIterator(options);
bool ok = true;
int count = 0;
for (miter->SeekToFirst(), dbiter->SeekToFirst();
ok && miter->Valid() && dbiter->Valid();
miter->Next(), dbiter->Next()) {
count++;
if (miter->key().compare(dbiter->key()) != 0) {
fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(dbiter->key()).c_str());
ok = false;
break;
}
if (miter->value().compare(dbiter->value()) != 0) {
fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(miter->value()).c_str(),
EscapeString(miter->value()).c_str());
ok = false;
}
}
if (ok) {
if (miter->Valid() != dbiter->Valid()) {
fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
step, miter->Valid(), dbiter->Valid());
ok = false;
}
}
fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
delete miter;
delete dbiter;
return ok;
}
TEST(DBTest, Randomized) {
Random rnd(test::RandomSeed());
do {
ModelDB model(CurrentOptions());
const int N = 10000;
const Snapshot* model_snap = NULL;
const Snapshot* db_snap = NULL;
std::string k, v;
for (int step = 0; step < N; step++) {
if (step % 100 == 0) {
fprintf(stderr, "Step %d of %d\n", step, N);
}
// TODO(sanjay): Test Get() works
int p = rnd.Uniform(100);
if (p < 45) { // Put
k = RandomKey(&rnd);
v = RandomString(&rnd,
rnd.OneIn(20)
? 100 + rnd.Uniform(100)
: rnd.Uniform(8));
ASSERT_OK(model.Put(WriteOptions(), k, v));
ASSERT_OK(db_->Put(WriteOptions(), k, v));
} else if (p < 90) { // Delete
k = RandomKey(&rnd);
ASSERT_OK(model.Delete(WriteOptions(), k));
ASSERT_OK(db_->Delete(WriteOptions(), k));
} else { // Multi-element batch
WriteBatch b;
const int num = rnd.Uniform(8);
for (int i = 0; i < num; i++) {
if (i == 0 || !rnd.OneIn(10)) {
k = RandomKey(&rnd);
} else {
// Periodically re-use the same key from the previous iter, so
// we have multiple entries in the write batch for the same key
}
if (rnd.OneIn(2)) {
v = RandomString(&rnd, rnd.Uniform(10));
b.Put(k, v);
} else {
b.Delete(k);
}
}
ASSERT_OK(model.Write(WriteOptions(), &b));
ASSERT_OK(db_->Write(WriteOptions(), &b));
}
if ((step % 100) == 0) {
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
// Save a snapshot from each DB this time that we'll use next
// time we compare things, to make sure the current state is
// preserved with the snapshot
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
Reopen();
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
model_snap = model.GetSnapshot();
db_snap = db_->GetSnapshot();
}
}
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
} while (ChangeOptions());
}
std::string MakeKey(unsigned int num) {
char buf[30];
snprintf(buf, sizeof(buf), "%016u", num);
return std::string(buf);
}
void BM_LogAndApply(int iters, int num_base_files) {
std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
DestroyDB(dbname, Options());
DB* db = NULL;
Options opts;
opts.create_if_missing = true;
Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
Env* env = Env::Default();
port::Mutex mu;
MutexLock l(&mu);
InternalKeyComparator cmp(BytewiseComparator());
Options options;
VersionSet vset(dbname, &options, NULL, &cmp);
ASSERT_OK(vset.Recover());
VersionEdit vbase;
uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
}
ASSERT_OK(vset.LogAndApply(&vbase, &mu));
uint64_t start_micros = env->NowMicros();
for (int i = 0; i < iters; i++) {
VersionEdit vedit;
vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
vset.LogAndApply(&vedit, &mu);
}
uint64_t stop_micros = env->NowMicros();
unsigned int us = stop_micros - start_micros;
char buf[16];
snprintf(buf, sizeof(buf), "%d", num_base_files);
fprintf(stderr,
"BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n",
buf, iters, us, ((float)us) / iters);
}
} // namespace leveldb
int main(int argc, char** argv) {
if (argc > 1 && std::string(argv[1]) == "--benchmark") {
leveldb::BM_LogAndApply(1000, 1);
leveldb::BM_LogAndApply(1000, 100);
leveldb::BM_LogAndApply(1000, 10000);
leveldb::BM_LogAndApply(100, 100000);
return 0;
}
return leveldb::test::RunAllTests();
}
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
index cf976b437..bdd4a579b 100644
--- a/src/leveldb/db/version_set.cc
+++ b/src/leveldb/db/version_set.cc
@@ -1,1402 +1,1402 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_set.h"
#include <algorithm>
#include <stdio.h>
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "leveldb/env.h"
#include "leveldb/table_builder.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
static const int kTargetFileSize = 2 * 1048576;
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
// stop building a single file in a level->level+1 compaction.
static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
// Maximum number of bytes in all compacted files. We avoid expanding
// the lower level file set of a compaction if it would make the
// total compaction cover more than this many bytes.
static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize;
static double MaxBytesForLevel(int level) {
// Note: the result for level zero is not really used since we set
// the level-0 compaction threshold based on number of files.
double result = 10 * 1048576.0; // Result for both level-0 and level-1
while (level > 1) {
result *= 10;
level--;
}
return result;
}
static uint64_t MaxFileSizeForLevel(int level) {
return kTargetFileSize; // We could vary per level to reduce number of files?
}
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
int64_t sum = 0;
for (size_t i = 0; i < files.size(); i++) {
sum += files[i]->file_size;
}
return sum;
}
namespace {
std::string IntSetToString(const std::set<uint64_t>& s) {
std::string result = "{";
for (std::set<uint64_t>::const_iterator it = s.begin();
it != s.end();
++it) {
result += (result.size() > 1) ? "," : "";
result += NumberToString(*it);
}
result += "}";
return result;
}
} // namespace
Version::~Version() {
assert(refs_ == 0);
// Remove from linked list
prev_->next_ = next_;
next_->prev_ = prev_;
// Drop references to files
for (int level = 0; level < config::kNumLevels; level++) {
for (size_t i = 0; i < files_[level].size(); i++) {
FileMetaData* f = files_[level][i];
assert(f->refs > 0);
f->refs--;
if (f->refs <= 0) {
delete f;
}
}
}
}
int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key) {
uint32_t left = 0;
uint32_t right = files.size();
while (left < right) {
uint32_t mid = (left + right) / 2;
const FileMetaData* f = files[mid];
if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
// Key at "mid.largest" is < "target". Therefore all
// files at or before "mid" are uninteresting.
left = mid + 1;
} else {
// Key at "mid.largest" is >= "target". Therefore all files
// after "mid" are uninteresting.
right = mid;
}
}
return right;
}
static bool AfterFile(const Comparator* ucmp,
const Slice* user_key, const FileMetaData* f) {
// NULL user_key occurs before all keys and is therefore never after *f
return (user_key != NULL &&
ucmp->Compare(*user_key, f->largest.user_key()) > 0);
}
static bool BeforeFile(const Comparator* ucmp,
const Slice* user_key, const FileMetaData* f) {
// NULL user_key occurs after all keys and is therefore never before *f
return (user_key != NULL &&
ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
}
bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key) {
const Comparator* ucmp = icmp.user_comparator();
if (!disjoint_sorted_files) {
// Need to check against all files
for (size_t i = 0; i < files.size(); i++) {
const FileMetaData* f = files[i];
if (AfterFile(ucmp, smallest_user_key, f) ||
BeforeFile(ucmp, largest_user_key, f)) {
// No overlap
} else {
return true; // Overlap
}
}
return false;
}
// Binary search over file list
uint32_t index = 0;
if (smallest_user_key != NULL) {
// Find the earliest possible internal key for smallest_user_key
InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
index = FindFile(icmp, files, small.Encode());
}
if (index >= files.size()) {
// beginning of range is after all files, so no overlap.
return false;
}
return !BeforeFile(ucmp, largest_user_key, files[index]);
}
// An internal iterator. For a given version/level pair, yields
// information about the files in the level. For a given entry, key()
// is the largest key that occurs in the file, and value() is an
// 16-byte value containing the file number and file size, both
// encoded using EncodeFixed64.
class Version::LevelFileNumIterator : public Iterator {
public:
LevelFileNumIterator(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>* flist)
: icmp_(icmp),
flist_(flist),
index_(flist->size()) { // Marks as invalid
}
virtual bool Valid() const {
return index_ < flist_->size();
}
virtual void Seek(const Slice& target) {
index_ = FindFile(icmp_, *flist_, target);
}
virtual void SeekToFirst() { index_ = 0; }
virtual void SeekToLast() {
index_ = flist_->empty() ? 0 : flist_->size() - 1;
}
virtual void Next() {
assert(Valid());
index_++;
}
virtual void Prev() {
assert(Valid());
if (index_ == 0) {
index_ = flist_->size(); // Marks as invalid
} else {
index_--;
}
}
Slice key() const {
assert(Valid());
return (*flist_)[index_]->largest.Encode();
}
Slice value() const {
assert(Valid());
EncodeFixed64(value_buf_, (*flist_)[index_]->number);
EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
return Slice(value_buf_, sizeof(value_buf_));
}
virtual Status status() const { return Status::OK(); }
private:
const InternalKeyComparator icmp_;
const std::vector<FileMetaData*>* const flist_;
uint32_t index_;
// Backing store for value(). Holds the file number and size.
mutable char value_buf_[16];
};
static Iterator* GetFileIterator(void* arg,
const ReadOptions& options,
const Slice& file_value) {
TableCache* cache = reinterpret_cast<TableCache*>(arg);
if (file_value.size() != 16) {
return NewErrorIterator(
Status::Corruption("FileReader invoked with unexpected value"));
} else {
return cache->NewIterator(options,
DecodeFixed64(file_value.data()),
DecodeFixed64(file_value.data() + 8));
}
}
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
int level) const {
return NewTwoLevelIterator(
new LevelFileNumIterator(vset_->icmp_, &files_[level]),
&GetFileIterator, vset_->table_cache_, options);
}
void Version::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iters) {
// Merge all level zero files together since they may overlap
for (size_t i = 0; i < files_[0].size(); i++) {
iters->push_back(
vset_->table_cache_->NewIterator(
options, files_[0][i]->number, files_[0][i]->file_size));
}
// For levels > 0, we can use a concatenating iterator that sequentially
// walks through the non-overlapping files in the level, opening them
// lazily.
for (int level = 1; level < config::kNumLevels; level++) {
if (!files_[level].empty()) {
iters->push_back(NewConcatenatingIterator(options, level));
}
}
}
// Callback from TableCache::Get()
namespace {
enum SaverState {
kNotFound,
kFound,
kDeleted,
kCorrupt,
};
struct Saver {
SaverState state;
const Comparator* ucmp;
Slice user_key;
std::string* value;
};
}
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
Saver* s = reinterpret_cast<Saver*>(arg);
ParsedInternalKey parsed_key;
if (!ParseInternalKey(ikey, &parsed_key)) {
s->state = kCorrupt;
} else {
if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
if (s->state == kFound) {
s->value->assign(v.data(), v.size());
}
}
}
}
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
return a->number > b->number;
}
Status Version::Get(const ReadOptions& options,
const LookupKey& k,
std::string* value,
GetStats* stats) {
Slice ikey = k.internal_key();
Slice user_key = k.user_key();
const Comparator* ucmp = vset_->icmp_.user_comparator();
Status s;
stats->seek_file = NULL;
stats->seek_file_level = -1;
FileMetaData* last_file_read = NULL;
int last_file_read_level = -1;
// We can search level-by-level since entries never hop across
// levels. Therefore we are guaranteed that if we find data
// in an smaller level, later levels are irrelevant.
std::vector<FileMetaData*> tmp;
FileMetaData* tmp2;
for (int level = 0; level < config::kNumLevels; level++) {
size_t num_files = files_[level].size();
if (num_files == 0) continue;
// Get the list of files to search in this level
FileMetaData* const* files = &files_[level][0];
if (level == 0) {
// Level-0 files may overlap each other. Find all files that
// overlap user_key and process them in order from newest to oldest.
tmp.reserve(num_files);
for (uint32_t i = 0; i < num_files; i++) {
FileMetaData* f = files[i];
if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
tmp.push_back(f);
}
}
if (tmp.empty()) continue;
std::sort(tmp.begin(), tmp.end(), NewestFirst);
files = &tmp[0];
num_files = tmp.size();
} else {
// Binary search to find earliest index whose largest key >= ikey.
uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
if (index >= num_files) {
files = NULL;
num_files = 0;
} else {
tmp2 = files[index];
if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
// All of "tmp2" is past any data for user_key
files = NULL;
num_files = 0;
} else {
files = &tmp2;
num_files = 1;
}
}
}
for (uint32_t i = 0; i < num_files; ++i) {
if (last_file_read != NULL && stats->seek_file == NULL) {
// We have had more than one seek for this read. Charge the 1st file.
stats->seek_file = last_file_read;
stats->seek_file_level = last_file_read_level;
}
FileMetaData* f = files[i];
last_file_read = f;
last_file_read_level = level;
Saver saver;
saver.state = kNotFound;
saver.ucmp = ucmp;
saver.user_key = user_key;
saver.value = value;
s = vset_->table_cache_->Get(options, f->number, f->file_size,
ikey, &saver, SaveValue);
if (!s.ok()) {
return s;
}
switch (saver.state) {
case kNotFound:
break; // Keep searching in other files
case kFound:
return s;
case kDeleted:
s = Status::NotFound(Slice()); // Use empty error message for speed
return s;
case kCorrupt:
s = Status::Corruption("corrupted key for ", user_key);
return s;
}
}
}
return Status::NotFound(Slice()); // Use an empty error message for speed
}
bool Version::UpdateStats(const GetStats& stats) {
FileMetaData* f = stats.seek_file;
if (f != NULL) {
f->allowed_seeks--;
if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) {
file_to_compact_ = f;
file_to_compact_level_ = stats.seek_file_level;
return true;
}
}
return false;
}
void Version::Ref() {
++refs_;
}
void Version::Unref() {
assert(this != &vset_->dummy_versions_);
assert(refs_ >= 1);
--refs_;
if (refs_ == 0) {
delete this;
}
}
bool Version::OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key) {
return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
smallest_user_key, largest_user_key);
}
int Version::PickLevelForMemTableOutput(
const Slice& smallest_user_key,
const Slice& largest_user_key) {
int level = 0;
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
// Push to next level if there is no overlap in next level,
// and the #bytes overlapping in the level after that are limited.
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
std::vector<FileMetaData*> overlaps;
while (level < config::kMaxMemCompactLevel) {
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
break;
}
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > kMaxGrandParentOverlapBytes) {
break;
}
level++;
}
}
return level;
}
// Store in "*inputs" all files in "level" that overlap [begin,end]
void Version::GetOverlappingInputs(
int level,
const InternalKey* begin,
const InternalKey* end,
std::vector<FileMetaData*>* inputs) {
inputs->clear();
Slice user_begin, user_end;
if (begin != NULL) {
user_begin = begin->user_key();
}
if (end != NULL) {
user_end = end->user_key();
}
const Comparator* user_cmp = vset_->icmp_.user_comparator();
for (size_t i = 0; i < files_[level].size(); ) {
FileMetaData* f = files_[level][i++];
const Slice file_start = f->smallest.user_key();
const Slice file_limit = f->largest.user_key();
if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
// "f" is completely before specified range; skip it
} else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
// "f" is completely after specified range; skip it
} else {
inputs->push_back(f);
if (level == 0) {
// Level-0 files may overlap each other. So check if the newly
// added file has expanded the range. If so, restart search.
if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) {
user_begin = file_start;
inputs->clear();
i = 0;
} else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) {
user_end = file_limit;
inputs->clear();
i = 0;
}
}
}
}
}
std::string Version::DebugString() const {
std::string r;
for (int level = 0; level < config::kNumLevels; level++) {
// E.g.,
// --- level 1 ---
// 17:123['a' .. 'd']
// 20:43['e' .. 'g']
r.append("--- level ");
AppendNumberTo(&r, level);
r.append(" ---\n");
const std::vector<FileMetaData*>& files = files_[level];
for (size_t i = 0; i < files.size(); i++) {
r.push_back(' ');
AppendNumberTo(&r, files[i]->number);
r.push_back(':');
AppendNumberTo(&r, files[i]->file_size);
r.append("[");
r.append(files[i]->smallest.DebugString());
r.append(" .. ");
r.append(files[i]->largest.DebugString());
r.append("]\n");
}
}
return r;
}
// A helper class so we can efficiently apply a whole sequence
// of edits to a particular state without creating intermediate
// Versions that contain full copies of the intermediate state.
class VersionSet::Builder {
private:
// Helper to sort by v->files_[file_number].smallest
struct BySmallestKey {
const InternalKeyComparator* internal_comparator;
bool operator()(FileMetaData* f1, FileMetaData* f2) const {
int r = internal_comparator->Compare(f1->smallest, f2->smallest);
if (r != 0) {
return (r < 0);
} else {
// Break ties by file number
return (f1->number < f2->number);
}
}
};
typedef std::set<FileMetaData*, BySmallestKey> FileSet;
struct LevelState {
std::set<uint64_t> deleted_files;
FileSet* added_files;
};
VersionSet* vset_;
Version* base_;
LevelState levels_[config::kNumLevels];
public:
// Initialize a builder with the files from *base and other info from *vset
Builder(VersionSet* vset, Version* base)
: vset_(vset),
base_(base) {
base_->Ref();
BySmallestKey cmp;
cmp.internal_comparator = &vset_->icmp_;
for (int level = 0; level < config::kNumLevels; level++) {
levels_[level].added_files = new FileSet(cmp);
}
}
~Builder() {
for (int level = 0; level < config::kNumLevels; level++) {
const FileSet* added = levels_[level].added_files;
std::vector<FileMetaData*> to_unref;
to_unref.reserve(added->size());
for (FileSet::const_iterator it = added->begin();
it != added->end(); ++it) {
to_unref.push_back(*it);
}
delete added;
for (uint32_t i = 0; i < to_unref.size(); i++) {
FileMetaData* f = to_unref[i];
f->refs--;
if (f->refs <= 0) {
delete f;
}
}
}
base_->Unref();
}
// Apply all of the edits in *edit to the current state.
void Apply(VersionEdit* edit) {
// Update compaction pointers
for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
const int level = edit->compact_pointers_[i].first;
vset_->compact_pointer_[level] =
edit->compact_pointers_[i].second.Encode().ToString();
}
// Delete files
const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
iter != del.end();
++iter) {
const int level = iter->first;
const uint64_t number = iter->second;
levels_[level].deleted_files.insert(number);
}
// Add new files
for (size_t i = 0; i < edit->new_files_.size(); i++) {
const int level = edit->new_files_[i].first;
FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
f->refs = 1;
// We arrange to automatically compact this file after
// a certain number of seeks. Let's assume:
// (1) One seek costs 10ms
// (2) Writing or reading 1MB costs 10ms (100MB/s)
// (3) A compaction of 1MB does 25MB of IO:
// 1MB read from this level
// 10-12MB read from next level (boundaries may be misaligned)
// 10-12MB written to next level
// This implies that 25 seeks cost the same as the compaction
// of 1MB of data. I.e., one seek costs approximately the
// same as the compaction of 40KB of data. We are a little
// conservative and allow approximately one seek for every 16KB
// of data before triggering a compaction.
f->allowed_seeks = (f->file_size / 16384);
if (f->allowed_seeks < 100) f->allowed_seeks = 100;
levels_[level].deleted_files.erase(f->number);
levels_[level].added_files->insert(f);
}
}
// Save the current state in *v.
void SaveTo(Version* v) {
BySmallestKey cmp;
cmp.internal_comparator = &vset_->icmp_;
for (int level = 0; level < config::kNumLevels; level++) {
// Merge the set of added files with the set of pre-existing files.
// Drop any deleted files. Store the result in *v.
const std::vector<FileMetaData*>& base_files = base_->files_[level];
std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
const FileSet* added = levels_[level].added_files;
v->files_[level].reserve(base_files.size() + added->size());
for (FileSet::const_iterator added_iter = added->begin();
added_iter != added->end();
++added_iter) {
// Add all smaller files listed in base_
for (std::vector<FileMetaData*>::const_iterator bpos
= std::upper_bound(base_iter, base_end, *added_iter, cmp);
base_iter != bpos;
++base_iter) {
MaybeAddFile(v, level, *base_iter);
}
MaybeAddFile(v, level, *added_iter);
}
// Add remaining base files
for (; base_iter != base_end; ++base_iter) {
MaybeAddFile(v, level, *base_iter);
}
#ifndef NDEBUG
// Make sure there is no overlap in levels > 0
if (level > 0) {
for (uint32_t i = 1; i < v->files_[level].size(); i++) {
const InternalKey& prev_end = v->files_[level][i-1]->largest;
const InternalKey& this_begin = v->files_[level][i]->smallest;
if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
prev_end.DebugString().c_str(),
this_begin.DebugString().c_str());
abort();
}
}
}
#endif
}
}
void MaybeAddFile(Version* v, int level, FileMetaData* f) {
if (levels_[level].deleted_files.count(f->number) > 0) {
// File is deleted: do nothing
} else {
std::vector<FileMetaData*>* files = &v->files_[level];
if (level > 0 && !files->empty()) {
// Must not overlap
assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
f->smallest) < 0);
}
f->refs++;
files->push_back(f);
}
}
};
VersionSet::VersionSet(const std::string& dbname,
const Options* options,
TableCache* table_cache,
const InternalKeyComparator* cmp)
: env_(options->env),
dbname_(dbname),
options_(options),
table_cache_(table_cache),
icmp_(*cmp),
next_file_number_(2),
manifest_file_number_(0), // Filled by Recover()
last_sequence_(0),
log_number_(0),
prev_log_number_(0),
descriptor_file_(NULL),
descriptor_log_(NULL),
dummy_versions_(this),
current_(NULL) {
AppendVersion(new Version(this));
}
VersionSet::~VersionSet() {
current_->Unref();
assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty
delete descriptor_log_;
delete descriptor_file_;
}
void VersionSet::AppendVersion(Version* v) {
// Make "v" current
assert(v->refs_ == 0);
assert(v != current_);
if (current_ != NULL) {
current_->Unref();
}
current_ = v;
v->Ref();
// Append to linked list
v->prev_ = dummy_versions_.prev_;
v->next_ = &dummy_versions_;
v->prev_->next_ = v;
v->next_->prev_ = v;
}
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
if (edit->has_log_number_) {
assert(edit->log_number_ >= log_number_);
assert(edit->log_number_ < next_file_number_);
} else {
edit->SetLogNumber(log_number_);
}
if (!edit->has_prev_log_number_) {
edit->SetPrevLogNumber(prev_log_number_);
}
edit->SetNextFile(next_file_number_);
edit->SetLastSequence(last_sequence_);
Version* v = new Version(this);
{
Builder builder(this, current_);
builder.Apply(edit);
builder.SaveTo(v);
}
Finalize(v);
// Initialize new descriptor log file if necessary by creating
// a temporary file that contains a snapshot of the current version.
std::string new_manifest_file;
Status s;
if (descriptor_log_ == NULL) {
// No reason to unlock *mu here since we only hit this path in the
// first call to LogAndApply (when opening the database).
assert(descriptor_file_ == NULL);
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
edit->SetNextFile(next_file_number_);
s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
if (s.ok()) {
descriptor_log_ = new log::Writer(descriptor_file_);
s = WriteSnapshot(descriptor_log_);
}
}
// Unlock during expensive MANIFEST log write
{
mu->Unlock();
// Write new record to MANIFEST log
if (s.ok()) {
std::string record;
edit->EncodeTo(&record);
s = descriptor_log_->AddRecord(record);
if (s.ok()) {
s = descriptor_file_->Sync();
}
}
// If we just created a new descriptor file, install it by writing a
// new CURRENT file that points to it.
if (s.ok() && !new_manifest_file.empty()) {
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
}
mu->Lock();
}
// Install the new version
if (s.ok()) {
AppendVersion(v);
log_number_ = edit->log_number_;
prev_log_number_ = edit->prev_log_number_;
} else {
delete v;
if (!new_manifest_file.empty()) {
delete descriptor_log_;
delete descriptor_file_;
descriptor_log_ = NULL;
descriptor_file_ = NULL;
env_->DeleteFile(new_manifest_file);
}
}
return s;
}
Status VersionSet::Recover() {
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
if (this->status->ok()) *this->status = s;
}
};
// Read "CURRENT" file, which contains a pointer to the current manifest file
std::string current;
Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
if (!s.ok()) {
return s;
}
if (current.empty() || current[current.size()-1] != '\n') {
return Status::Corruption("CURRENT file does not end with newline");
}
current.resize(current.size() - 1);
std::string dscname = dbname_ + "/" + current;
SequentialFile* file;
s = env_->NewSequentialFile(dscname, &file);
if (!s.ok()) {
return s;
}
bool have_log_number = false;
bool have_prev_log_number = false;
bool have_next_file = false;
bool have_last_sequence = false;
uint64_t next_file = 0;
uint64_t last_sequence = 0;
uint64_t log_number = 0;
uint64_t prev_log_number = 0;
Builder builder(this, current_);
{
LogReporter reporter;
reporter.status = &s;
log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
VersionEdit edit;
s = edit.DecodeFrom(record);
if (s.ok()) {
if (edit.has_comparator_ &&
edit.comparator_ != icmp_.user_comparator()->Name()) {
s = Status::InvalidArgument(
- edit.comparator_ + "does not match existing comparator ",
+ edit.comparator_ + " does not match existing comparator ",
icmp_.user_comparator()->Name());
}
}
if (s.ok()) {
builder.Apply(&edit);
}
if (edit.has_log_number_) {
log_number = edit.log_number_;
have_log_number = true;
}
if (edit.has_prev_log_number_) {
prev_log_number = edit.prev_log_number_;
have_prev_log_number = true;
}
if (edit.has_next_file_number_) {
next_file = edit.next_file_number_;
have_next_file = true;
}
if (edit.has_last_sequence_) {
last_sequence = edit.last_sequence_;
have_last_sequence = true;
}
}
}
delete file;
file = NULL;
if (s.ok()) {
if (!have_next_file) {
s = Status::Corruption("no meta-nextfile entry in descriptor");
} else if (!have_log_number) {
s = Status::Corruption("no meta-lognumber entry in descriptor");
} else if (!have_last_sequence) {
s = Status::Corruption("no last-sequence-number entry in descriptor");
}
if (!have_prev_log_number) {
prev_log_number = 0;
}
MarkFileNumberUsed(prev_log_number);
MarkFileNumberUsed(log_number);
}
if (s.ok()) {
Version* v = new Version(this);
builder.SaveTo(v);
// Install recovered version
Finalize(v);
AppendVersion(v);
manifest_file_number_ = next_file;
next_file_number_ = next_file + 1;
last_sequence_ = last_sequence;
log_number_ = log_number;
prev_log_number_ = prev_log_number;
}
return s;
}
void VersionSet::MarkFileNumberUsed(uint64_t number) {
if (next_file_number_ <= number) {
next_file_number_ = number + 1;
}
}
void VersionSet::Finalize(Version* v) {
// Precomputed best level for next compaction
int best_level = -1;
double best_score = -1;
for (int level = 0; level < config::kNumLevels-1; level++) {
double score;
if (level == 0) {
// We treat level-0 specially by bounding the number of files
// instead of number of bytes for two reasons:
//
// (1) With larger write-buffer sizes, it is nice not to do too
// many level-0 compactions.
//
// (2) The files in level-0 are merged on every read and
// therefore we wish to avoid too many files when the individual
// file size is small (perhaps because of a small write-buffer
// setting, or very high compression ratios, or lots of
// overwrites/deletions).
score = v->files_[level].size() /
static_cast<double>(config::kL0_CompactionTrigger);
} else {
// Compute the ratio of current size to size limit.
const uint64_t level_bytes = TotalFileSize(v->files_[level]);
score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
}
if (score > best_score) {
best_level = level;
best_score = score;
}
}
v->compaction_level_ = best_level;
v->compaction_score_ = best_score;
}
Status VersionSet::WriteSnapshot(log::Writer* log) {
// TODO: Break up into multiple records to reduce memory usage on recovery?
// Save metadata
VersionEdit edit;
edit.SetComparatorName(icmp_.user_comparator()->Name());
// Save compaction pointers
for (int level = 0; level < config::kNumLevels; level++) {
if (!compact_pointer_[level].empty()) {
InternalKey key;
key.DecodeFrom(compact_pointer_[level]);
edit.SetCompactPointer(level, key);
}
}
// Save files
for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = current_->files_[level];
for (size_t i = 0; i < files.size(); i++) {
const FileMetaData* f = files[i];
edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
}
}
std::string record;
edit.EncodeTo(&record);
return log->AddRecord(record);
}
int VersionSet::NumLevelFiles(int level) const {
assert(level >= 0);
assert(level < config::kNumLevels);
return current_->files_[level].size();
}
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
// Update code if kNumLevels changes
assert(config::kNumLevels == 7);
snprintf(scratch->buffer, sizeof(scratch->buffer),
"files[ %d %d %d %d %d %d %d ]",
int(current_->files_[0].size()),
int(current_->files_[1].size()),
int(current_->files_[2].size()),
int(current_->files_[3].size()),
int(current_->files_[4].size()),
int(current_->files_[5].size()),
int(current_->files_[6].size()));
return scratch->buffer;
}
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
uint64_t result = 0;
for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = v->files_[level];
for (size_t i = 0; i < files.size(); i++) {
if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
// Entire file is before "ikey", so just add the file size
result += files[i]->file_size;
} else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
// Entire file is after "ikey", so ignore
if (level > 0) {
// Files other than level 0 are sorted by meta->smallest, so
// no further files in this level will contain data for
// "ikey".
break;
}
} else {
// "ikey" falls in the range for this table. Add the
// approximate offset of "ikey" within the table.
Table* tableptr;
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
if (tableptr != NULL) {
result += tableptr->ApproximateOffsetOf(ikey.Encode());
}
delete iter;
}
}
}
return result;
}
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
for (Version* v = dummy_versions_.next_;
v != &dummy_versions_;
v = v->next_) {
for (int level = 0; level < config::kNumLevels; level++) {
const std::vector<FileMetaData*>& files = v->files_[level];
for (size_t i = 0; i < files.size(); i++) {
live->insert(files[i]->number);
}
}
}
}
int64_t VersionSet::NumLevelBytes(int level) const {
assert(level >= 0);
assert(level < config::kNumLevels);
return TotalFileSize(current_->files_[level]);
}
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
int64_t result = 0;
std::vector<FileMetaData*> overlaps;
for (int level = 1; level < config::kNumLevels - 1; level++) {
for (size_t i = 0; i < current_->files_[level].size(); i++) {
const FileMetaData* f = current_->files_[level][i];
current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
&overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > result) {
result = sum;
}
}
}
return result;
}
// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
// REQUIRES: inputs is not empty
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest) {
assert(!inputs.empty());
smallest->Clear();
largest->Clear();
for (size_t i = 0; i < inputs.size(); i++) {
FileMetaData* f = inputs[i];
if (i == 0) {
*smallest = f->smallest;
*largest = f->largest;
} else {
if (icmp_.Compare(f->smallest, *smallest) < 0) {
*smallest = f->smallest;
}
if (icmp_.Compare(f->largest, *largest) > 0) {
*largest = f->largest;
}
}
}
}
// Stores the minimal range that covers all entries in inputs1 and inputs2
// in *smallest, *largest.
// REQUIRES: inputs is not empty
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest) {
std::vector<FileMetaData*> all = inputs1;
all.insert(all.end(), inputs2.begin(), inputs2.end());
GetRange(all, smallest, largest);
}
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
ReadOptions options;
options.verify_checksums = options_->paranoid_checks;
options.fill_cache = false;
// Level-0 files have to be merged together. For other levels,
// we will make a concatenating iterator per level.
// TODO(opt): use concatenating iterator for level-0 if there is no overlap
const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
Iterator** list = new Iterator*[space];
int num = 0;
for (int which = 0; which < 2; which++) {
if (!c->inputs_[which].empty()) {
if (c->level() + which == 0) {
const std::vector<FileMetaData*>& files = c->inputs_[which];
for (size_t i = 0; i < files.size(); i++) {
list[num++] = table_cache_->NewIterator(
options, files[i]->number, files[i]->file_size);
}
} else {
// Create concatenating iterator for the files from this level
list[num++] = NewTwoLevelIterator(
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
&GetFileIterator, table_cache_, options);
}
}
}
assert(num <= space);
Iterator* result = NewMergingIterator(&icmp_, list, num);
delete[] list;
return result;
}
Compaction* VersionSet::PickCompaction() {
Compaction* c;
int level;
// We prefer compactions triggered by too much data in a level over
// the compactions triggered by seeks.
const bool size_compaction = (current_->compaction_score_ >= 1);
const bool seek_compaction = (current_->file_to_compact_ != NULL);
if (size_compaction) {
level = current_->compaction_level_;
assert(level >= 0);
assert(level+1 < config::kNumLevels);
c = new Compaction(level);
// Pick the first file that comes after compact_pointer_[level]
for (size_t i = 0; i < current_->files_[level].size(); i++) {
FileMetaData* f = current_->files_[level][i];
if (compact_pointer_[level].empty() ||
icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
c->inputs_[0].push_back(f);
break;
}
}
if (c->inputs_[0].empty()) {
// Wrap-around to the beginning of the key space
c->inputs_[0].push_back(current_->files_[level][0]);
}
} else if (seek_compaction) {
level = current_->file_to_compact_level_;
c = new Compaction(level);
c->inputs_[0].push_back(current_->file_to_compact_);
} else {
return NULL;
}
c->input_version_ = current_;
c->input_version_->Ref();
// Files in level 0 may overlap each other, so pick up all overlapping ones
if (level == 0) {
InternalKey smallest, largest;
GetRange(c->inputs_[0], &smallest, &largest);
// Note that the next call will discard the file we placed in
// c->inputs_[0] earlier and replace it with an overlapping set
// which will include the picked file.
current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
assert(!c->inputs_[0].empty());
}
SetupOtherInputs(c);
return c;
}
void VersionSet::SetupOtherInputs(Compaction* c) {
const int level = c->level();
InternalKey smallest, largest;
GetRange(c->inputs_[0], &smallest, &largest);
current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
// Get entire range covered by compaction
InternalKey all_start, all_limit;
GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
// See if we can grow the number of inputs in "level" without
// changing the number of "level+1" files we pick up.
if (!c->inputs_[1].empty()) {
std::vector<FileMetaData*> expanded0;
current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
const int64_t expanded0_size = TotalFileSize(expanded0);
if (expanded0.size() > c->inputs_[0].size() &&
inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) {
InternalKey new_start, new_limit;
GetRange(expanded0, &new_start, &new_limit);
std::vector<FileMetaData*> expanded1;
current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
&expanded1);
if (expanded1.size() == c->inputs_[1].size()) {
Log(options_->info_log,
"Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
level,
int(c->inputs_[0].size()),
int(c->inputs_[1].size()),
long(inputs0_size), long(inputs1_size),
int(expanded0.size()),
int(expanded1.size()),
long(expanded0_size), long(inputs1_size));
smallest = new_start;
largest = new_limit;
c->inputs_[0] = expanded0;
c->inputs_[1] = expanded1;
GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
}
}
}
// Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2)
if (level + 2 < config::kNumLevels) {
current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
&c->grandparents_);
}
if (false) {
Log(options_->info_log, "Compacting %d '%s' .. '%s'",
level,
smallest.DebugString().c_str(),
largest.DebugString().c_str());
}
// Update the place where we will do the next compaction for this level.
// We update this immediately instead of waiting for the VersionEdit
// to be applied so that if the compaction fails, we will try a different
// key range next time.
compact_pointer_[level] = largest.Encode().ToString();
c->edit_.SetCompactPointer(level, largest);
}
Compaction* VersionSet::CompactRange(
int level,
const InternalKey* begin,
const InternalKey* end) {
std::vector<FileMetaData*> inputs;
current_->GetOverlappingInputs(level, begin, end, &inputs);
if (inputs.empty()) {
return NULL;
}
// Avoid compacting too much in one shot in case the range is large.
const uint64_t limit = MaxFileSizeForLevel(level);
uint64_t total = 0;
for (size_t i = 0; i < inputs.size(); i++) {
uint64_t s = inputs[i]->file_size;
total += s;
if (total >= limit) {
inputs.resize(i + 1);
break;
}
}
Compaction* c = new Compaction(level);
c->input_version_ = current_;
c->input_version_->Ref();
c->inputs_[0] = inputs;
SetupOtherInputs(c);
return c;
}
Compaction::Compaction(int level)
: level_(level),
max_output_file_size_(MaxFileSizeForLevel(level)),
input_version_(NULL),
grandparent_index_(0),
seen_key_(false),
overlapped_bytes_(0) {
for (int i = 0; i < config::kNumLevels; i++) {
level_ptrs_[i] = 0;
}
}
Compaction::~Compaction() {
if (input_version_ != NULL) {
input_version_->Unref();
}
}
bool Compaction::IsTrivialMove() const {
// Avoid a move if there is lots of overlapping grandparent data.
// Otherwise, the move could create a parent file that will require
// a very expensive merge later on.
return (num_input_files(0) == 1 &&
num_input_files(1) == 0 &&
TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes);
}
void Compaction::AddInputDeletions(VersionEdit* edit) {
for (int which = 0; which < 2; which++) {
for (size_t i = 0; i < inputs_[which].size(); i++) {
edit->DeleteFile(level_ + which, inputs_[which][i]->number);
}
}
}
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
// Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
for (; level_ptrs_[lvl] < files.size(); ) {
FileMetaData* f = files[level_ptrs_[lvl]];
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
// We've advanced far enough
if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
// Key falls in this file's range, so definitely not base level
return false;
}
break;
}
level_ptrs_[lvl]++;
}
}
return true;
}
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
if (seen_key_) {
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
}
grandparent_index_++;
}
seen_key_ = true;
if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) {
// Too much overlap for current output; start new output
overlapped_bytes_ = 0;
return true;
} else {
return false;
}
}
void Compaction::ReleaseInputs() {
if (input_version_ != NULL) {
input_version_->Unref();
input_version_ = NULL;
}
}
} // namespace leveldb
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
index 61c4c99a0..792899b7f 100644
--- a/src/leveldb/db/version_set.h
+++ b/src/leveldb/db/version_set.h
@@ -1,379 +1,381 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
#include <map>
#include <set>
#include <vector>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
+#include "port/thread_annotations.h"
namespace leveldb {
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableBuilder;
class TableCache;
class Version;
class VersionSet;
class WritableFile;
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==NULL represents a key smaller than all keys in the DB.
// largest==NULL represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
// Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. Fills *stats.
// REQUIRES: lock is not held
struct GetStats {
FileMetaData* seek_file;
int seek_file_level;
};
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
GetStats* stats);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
void GetOverlappingInputs(
int level,
const InternalKey* begin, // NULL means before all keys
const InternalKey* end, // NULL means after all keys
std::vector<FileMetaData*>* inputs);
// Returns true iff some file in the specified level overlaps
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); }
// Return a human readable string that describes this version's contents.
std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels];
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_;
int compaction_level_;
explicit Version(VersionSet* vset)
: vset_(vset), next_(this), prev_(this), refs_(0),
file_to_compact_(NULL),
file_to_compact_level_(-1),
compaction_score_(-1),
compaction_level_(-1) {
}
~Version();
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file.
// REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply()
- Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
+ Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
+ EXCLUSIVE_LOCKS_REQUIRED(mu);
// Recover the last saved descriptor from persistent storage.
Status Recover();
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Arrange to reuse "file_number" unless a newer file number has
// already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number;
}
}
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return the last sequence number.
uint64_t LastSequence() const { return last_sequence_; }
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_ = s;
}
// Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; }
// Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns NULL if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey* begin,
const InternalKey* end);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const {
Version* v = current_;
return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL);
}
// Add all files listed in any live version to *live.
// May also mutate some internal state.
void AddLiveFiles(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
private:
class Builder;
friend class Compaction;
friend class Version;
void Finalize(Version* v);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
void GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest);
void SetupOtherInputs(Compaction* c);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v);
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// and "level+1" will be merged to produce a set of "level+1" files.
int level() const { return level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return &edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level);
int level_;
uint64_t max_output_file_size_;
Version* input_version_;
VersionEdit edit_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
int64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
// State for implementing IsBaseLevelForKey
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
size_t level_ptrs_[config::kNumLevels];
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/src/leveldb/doc/bench/db_bench_sqlite3.cc b/src/leveldb/doc/bench/db_bench_sqlite3.cc
index 256793a9d..e63aaa8dc 100644
--- a/src/leveldb/doc/bench/db_bench_sqlite3.cc
+++ b/src/leveldb/doc/bench/db_bench_sqlite3.cc
@@ -1,718 +1,718 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include <stdlib.h>
#include <sqlite3.h>
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
//
// fillseq -- write N values in sequential key order in async mode
// fillseqsync -- write N/100 values in sequential key order in sync mode
// fillseqbatch -- batch write N values in sequential key order in async mode
// fillrandom -- write N values in random key order in async mode
// fillrandsync -- write N/100 values in random key order in sync mode
// fillrandbatch -- batch write N values in sequential key order in async mode
// overwrite -- overwrite N values in random key order in async mode
// fillrand100K -- write N/1000 100K values in random order in async mode
// fillseq100K -- write N/1000 100K values in sequential order in async mode
// readseq -- read N times sequentially
// readrandom -- read N times in random order
// readrand100K -- read N/1000 100K values in sequential order in async mode
static const char* FLAGS_benchmarks =
"fillseq,"
"fillseqsync,"
"fillseqbatch,"
"fillrandom,"
"fillrandsync,"
"fillrandbatch,"
"overwrite,"
"overwritebatch,"
"readrandom,"
"readseq,"
"fillrand100K,"
"fillseq100K,"
"readseq,"
"readrand100K,"
;
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Size of each value
static int FLAGS_value_size = 100;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;
// Page size. Default 1 KB.
static int FLAGS_page_size = 1024;
// Number of pages.
// Default cache size = FLAGS_page_size * FLAGS_num_pages = 4 MB.
static int FLAGS_num_pages = 4096;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
// If true, we allow batch writes to occur
static bool FLAGS_transaction = true;
// If true, we enable Write-Ahead Logging
static bool FLAGS_WAL_enabled = true;
// Use the db with the following name.
static const char* FLAGS_db = NULL;
inline
static void ExecErrorCheck(int status, char *err_msg) {
if (status != SQLITE_OK) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
exit(1);
}
}
inline
static void StepErrorCheck(int status) {
if (status != SQLITE_DONE) {
fprintf(stderr, "SQL step error: status = %d\n", status);
exit(1);
}
}
inline
static void ErrorCheck(int status) {
if (status != SQLITE_OK) {
fprintf(stderr, "sqlite3 error: status = %d\n", status);
exit(1);
}
}
inline
static void WalCheckpoint(sqlite3* db_) {
// Flush all writes to disk
if (FLAGS_WAL_enabled) {
sqlite3_wal_checkpoint_v2(db_, NULL, SQLITE_CHECKPOINT_FULL, NULL, NULL);
}
}
namespace leveldb {
// Helper for quickly generating random data.
namespace {
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static Slice TrimSpace(Slice s) {
int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
int limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
} // namespace
class Benchmark {
private:
sqlite3* db_;
int db_num_;
int num_;
int reads_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void PrintHeader() {
const int kKeySize = 16;
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
fprintf(stdout, "Values: %d bytes each\n", FLAGS_value_size);
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
/ 1048576.0));
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
}
void PrintEnvironment() {
fprintf(stderr, "SQLite: version %s\n", SQLITE_VERSION);
#if defined(__linux)
time_t now = time(NULL);
fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != NULL) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != NULL) {
const char* sep = strchr(line, ':');
if (sep == NULL) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_ = std::string(rate) + " " + message_;
} else {
message_ = rate;
}
}
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
public:
enum Order {
SEQUENTIAL,
RANDOM
};
enum DBState {
FRESH,
EXISTING
};
Benchmark()
: db_(NULL),
db_num_(0),
num_(FLAGS_num),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
bytes_(0),
rand_(301) {
std::vector<std::string> files;
std::string test_dir;
Env::Default()->GetTestDirectory(&test_dir);
Env::Default()->GetChildren(test_dir, &files);
if (!FLAGS_use_existing_db) {
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("dbbench_sqlite3")) {
std::string file_name(test_dir);
file_name += "/";
file_name += files[i];
Env::Default()->DeleteFile(file_name.c_str());
}
}
}
}
~Benchmark() {
int status = sqlite3_close(db_);
ErrorCheck(status);
}
void Run() {
PrintHeader();
Open();
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
bytes_ = 0;
Start();
bool known = true;
bool write_sync = false;
if (name == Slice("fillseq")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseqbatch")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("fillrandom")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillrandbatch")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("overwrite")) {
Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("overwritebatch")) {
Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("fillrandsync")) {
write_sync = true;
Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseqsync")) {
write_sync = true;
Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillrand100K")) {
Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseq100K")) {
Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1);
WalCheckpoint(db_);
} else if (name == Slice("readseq")) {
ReadSequential();
} else if (name == Slice("readrandom")) {
Read(RANDOM, 1);
} else if (name == Slice("readrand100K")) {
int n = reads_;
reads_ /= 1000;
Read(RANDOM, 1);
reads_ = n;
} else {
known = false;
if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
}
if (known) {
Stop(name);
}
}
}
void Open() {
assert(db_ == NULL);
int status;
char file_name[100];
char* err_msg = NULL;
db_num_++;
// Open database
std::string tmp_dir;
Env::Default()->GetTestDirectory(&tmp_dir);
snprintf(file_name, sizeof(file_name),
"%s/dbbench_sqlite3-%d.db",
tmp_dir.c_str(),
db_num_);
status = sqlite3_open(file_name, &db_);
if (status) {
fprintf(stderr, "open error: %s\n", sqlite3_errmsg(db_));
exit(1);
}
// Change SQLite cache size
char cache_size[100];
snprintf(cache_size, sizeof(cache_size), "PRAGMA cache_size = %d",
FLAGS_num_pages);
status = sqlite3_exec(db_, cache_size, NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
// FLAGS_page_size is defaulted to 1024
if (FLAGS_page_size != 1024) {
char page_size[100];
snprintf(page_size, sizeof(page_size), "PRAGMA page_size = %d",
FLAGS_page_size);
status = sqlite3_exec(db_, page_size, NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
// Change journal mode to WAL if WAL enabled flag is on
if (FLAGS_WAL_enabled) {
std::string WAL_stmt = "PRAGMA journal_mode = WAL";
// LevelDB's default cache size is a combined 4 MB
std::string WAL_checkpoint = "PRAGMA wal_autocheckpoint = 4096";
status = sqlite3_exec(db_, WAL_stmt.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
status = sqlite3_exec(db_, WAL_checkpoint.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
// Change locking mode to exclusive and create tables/index for database
std::string locking_stmt = "PRAGMA locking_mode = EXCLUSIVE";
std::string create_stmt =
"CREATE TABLE test (key blob, value blob, PRIMARY KEY(key))";
std::string stmt_array[] = { locking_stmt, create_stmt };
int stmt_array_length = sizeof(stmt_array) / sizeof(std::string);
for (int i = 0; i < stmt_array_length; i++) {
status = sqlite3_exec(db_, stmt_array[i].c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
}
void Write(bool write_sync, Order order, DBState state,
int num_entries, int value_size, int entries_per_batch) {
// Create new database if state == FRESH
if (state == FRESH) {
if (FLAGS_use_existing_db) {
message_ = "skipping (--use_existing_db is true)";
return;
}
sqlite3_close(db_);
db_ = NULL;
Open();
Start();
}
if (num_entries != num_) {
char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
message_ = msg;
}
char* err_msg = NULL;
int status;
sqlite3_stmt *replace_stmt, *begin_trans_stmt, *end_trans_stmt;
std::string replace_str = "REPLACE INTO test (key, value) VALUES (?, ?)";
std::string begin_trans_str = "BEGIN TRANSACTION;";
std::string end_trans_str = "END TRANSACTION;";
// Check for synchronous flag in options
std::string sync_stmt = (write_sync) ? "PRAGMA synchronous = FULL" :
"PRAGMA synchronous = OFF";
status = sqlite3_exec(db_, sync_stmt.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
// Preparing sqlite3 statements
status = sqlite3_prepare_v2(db_, replace_str.c_str(), -1,
&replace_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1,
&begin_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1,
&end_trans_stmt, NULL);
ErrorCheck(status);
bool transaction = (entries_per_batch > 1);
for (int i = 0; i < num_entries; i += entries_per_batch) {
// Begin write transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(begin_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(begin_trans_stmt);
ErrorCheck(status);
}
// Create and execute SQL statements
for (int j = 0; j < entries_per_batch; j++) {
const char* value = gen_.Generate(value_size).data();
// Create values for key-value pair
const int k = (order == SEQUENTIAL) ? i + j :
(rand_.Next() % num_entries);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
// Bind KV values into replace_stmt
status = sqlite3_bind_blob(replace_stmt, 1, key, 16, SQLITE_STATIC);
ErrorCheck(status);
status = sqlite3_bind_blob(replace_stmt, 2, value,
value_size, SQLITE_STATIC);
ErrorCheck(status);
// Execute replace_stmt
bytes_ += value_size + strlen(key);
status = sqlite3_step(replace_stmt);
StepErrorCheck(status);
// Reset SQLite statement for another use
status = sqlite3_clear_bindings(replace_stmt);
ErrorCheck(status);
status = sqlite3_reset(replace_stmt);
ErrorCheck(status);
FinishedSingleOp();
}
// End write transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(end_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(end_trans_stmt);
ErrorCheck(status);
}
}
status = sqlite3_finalize(replace_stmt);
ErrorCheck(status);
status = sqlite3_finalize(begin_trans_stmt);
ErrorCheck(status);
status = sqlite3_finalize(end_trans_stmt);
ErrorCheck(status);
}
void Read(Order order, int entries_per_batch) {
int status;
sqlite3_stmt *read_stmt, *begin_trans_stmt, *end_trans_stmt;
std::string read_str = "SELECT * FROM test WHERE key = ?";
std::string begin_trans_str = "BEGIN TRANSACTION;";
std::string end_trans_str = "END TRANSACTION;";
// Preparing sqlite3 statements
status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1,
&begin_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1,
&end_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &read_stmt, NULL);
ErrorCheck(status);
bool transaction = (entries_per_batch > 1);
for (int i = 0; i < reads_; i += entries_per_batch) {
// Begin read transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(begin_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(begin_trans_stmt);
ErrorCheck(status);
}
// Create and execute SQL statements
for (int j = 0; j < entries_per_batch; j++) {
// Create key value
char key[100];
int k = (order == SEQUENTIAL) ? i + j : (rand_.Next() % reads_);
snprintf(key, sizeof(key), "%016d", k);
// Bind key value into read_stmt
status = sqlite3_bind_blob(read_stmt, 1, key, 16, SQLITE_STATIC);
ErrorCheck(status);
// Execute read statement
- while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
+ while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
StepErrorCheck(status);
// Reset SQLite statement for another use
status = sqlite3_clear_bindings(read_stmt);
ErrorCheck(status);
status = sqlite3_reset(read_stmt);
ErrorCheck(status);
FinishedSingleOp();
}
// End read transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(end_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(end_trans_stmt);
ErrorCheck(status);
}
}
status = sqlite3_finalize(read_stmt);
ErrorCheck(status);
status = sqlite3_finalize(begin_trans_stmt);
ErrorCheck(status);
status = sqlite3_finalize(end_trans_stmt);
ErrorCheck(status);
}
void ReadSequential() {
int status;
sqlite3_stmt *pStmt;
std::string read_str = "SELECT * FROM test ORDER BY key";
status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &pStmt, NULL);
ErrorCheck(status);
for (int i = 0; i < reads_ && SQLITE_ROW == sqlite3_step(pStmt); i++) {
bytes_ += sqlite3_column_bytes(pStmt, 1) + sqlite3_column_bytes(pStmt, 2);
FinishedSingleOp();
}
status = sqlite3_finalize(pStmt);
ErrorCheck(status);
}
};
} // namespace leveldb
int main(int argc, char** argv) {
std::string default_db_path;
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (leveldb::Slice(argv[i]) == leveldb::Slice("--no_transaction")) {
FLAGS_transaction = false;
} else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) {
FLAGS_page_size = n;
} else if (sscanf(argv[i], "--num_pages=%d%c", &n, &junk) == 1) {
FLAGS_num_pages = n;
} else if (sscanf(argv[i], "--WAL_enabled=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_WAL_enabled = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
// Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) {
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str();
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}
diff --git a/src/leveldb/doc/index.html b/src/leveldb/doc/index.html
index 521d2baf4..3ed0ed9d9 100644
--- a/src/leveldb/doc/index.html
+++ b/src/leveldb/doc/index.html
@@ -1,549 +1,549 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb</title>
</head>
<body>
<h1>Leveldb</h1>
<address>Jeff Dean, Sanjay Ghemawat</address>
<p>
The <code>leveldb</code> library provides a persistent key value store. Keys and
values are arbitrary byte arrays. The keys are ordered within the key
value store according to a user-specified comparator function.
<p>
<h1>Opening A Database</h1>
<p>
A <code>leveldb</code> database has a name which corresponds to a file system
directory. All of the contents of database are stored in this
directory. The following example shows how to open a database,
creating it if necessary:
<p>
<pre>
#include &lt;assert&gt;
#include "leveldb/db.h"
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
If you want to raise an error if the database already exists, add
the following line before the <code>leveldb::DB::Open</code> call:
<pre>
options.error_if_exists = true;
</pre>
<h1>Status</h1>
<p>
You may have noticed the <code>leveldb::Status</code> type above. Values of this
type are returned by most functions in <code>leveldb</code> that may encounter an
error. You can check if such a result is ok, and also print an
associated error message:
<p>
<pre>
leveldb::Status s = ...;
if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
</pre>
<h1>Closing A Database</h1>
<p>
When you are done with a database, just delete the database object.
Example:
<p>
<pre>
... open the db as described above ...
... do something with db ...
delete db;
</pre>
<h1>Reads And Writes</h1>
<p>
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
<pre>
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
</pre>
<h1>Atomic Updates</h1>
<p>
Note that if the process dies after the Put of key2 but before the
delete of key1, the same value may be left stored under multiple keys.
Such problems can be avoided by using the <code>WriteBatch</code> class to
atomically apply a set of updates:
<p>
<pre>
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) {
leveldb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
}
</pre>
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
and these edits within the batch are applied in order. Note that we
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
we do not end up erroneously dropping the value entirely.
<p>
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
<h1>Synchronous Writes</h1>
By default, each write to <code>leveldb</code> is asynchronous: it
returns after pushing the write from the process into the operating
system. The transfer from operating system memory to the underlying
persistent storage happens asynchronously. The <code>sync</code> flag
can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
<pre>
leveldb::WriteOptions write_options;
write_options.sync = true;
db-&gt;Put(write_options, ...);
</pre>
Asynchronous writes are often more than a thousand times as fast as
synchronous writes. The downside of asynchronous writes is that a
crash of the machine may cause the last few updates to be lost. Note
that a crash of just the writing process (i.e., not a reboot) will not
cause any loss since even when <code>sync</code> is false, an update
is pushed from the process memory into the operating system before it
is considered done.
<p>
Asynchronous writes can often be used safely. For example, when
loading a large amount of data into the database you can handle lost
updates by restarting the bulk load after a crash. A hybrid scheme is
also possible where every Nth write is synchronous, and in the event
of a crash, the bulk load is restarted just after the last synchronous
write finished by the previous run. (The synchronous write can update
a marker that describes where to restart on a crash.)
<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write (i.e.,
<code>write_options.sync</code> is set to true). The extra cost of
the synchronous write will be amortized across all of the writes in
the batch.
<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time.
The <code>leveldb</code> implementation acquires a lock from the
operating system to prevent misuse. Within a single process, the
same <code>leveldb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
}
assert(it-&gt;status().ok()); // Check for any errors found during the scan
delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
for (it-&gt;Seek(start);
it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
it-&gt;Next()) {
...
}
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration may be somewhat slower than forward iteration.)
<p>
<pre>
for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
...
}
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots are created by the DB::GetSnapshot() method:
<p>
<pre>
leveldb::ReadOptions options;
options.snapshot = db-&gt;GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db-&gt;NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db-&gt;ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>leveldb</code> methods do not return null-terminated
C-style strings since <code>leveldb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
leveldb::Slice s1 = "hello";
std::string str("world");
leveldb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
std::string str = s1.ToString();
assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
leveldb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for key,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>leveldb::Comparator</code> that expresses these rules:
<p>
<pre>
class TwoPartComparator : public leveldb::Comparator {
public:
// Three-way comparison function:
// if a &lt; b: negative result
// if a &gt; b: positive result
// else: zero result
int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
int a1, a2, b1, b2;
ParseKey(a, &amp;a1, &amp;a2);
ParseKey(b, &amp;b1, &amp;b2);
if (a1 &lt; b1) return -1;
if (a1 &gt; b1) return +1;
if (a2 &lt; b2) return -1;
if (a2 &gt; b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() const { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
void FindShortSuccessor(std::string*) const { }
};
</pre>
Now create a database using this custom comparator:
<p>
<pre>
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &amp;cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name (b) increment the version number
for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>include/leveldb/options.h</code>.
<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes.
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.cache</code> is non-NULL, it is used to cache frequently used
uncompressed block contents.
<p>
<pre>
#include "leveldb/cache.h"
leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db
delete options.cache;
</pre>
Note that the cache holds uncompressed data, and therefore it should
be sized according to application level data sizes, without any
reduction from compression. (Caching of compressed blocks is left to
the operating system buffer cache, or any custom <code>Env</code>
implementation provided by the client.)
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db-&gt;NewIterator(options);
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
...
}
</pre>
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>leveldb</code>. The types of entries we might wish to store are:
<p>
<pre>
filename -&gt; permission-bits, length, list of file_block_ids
file_block_id -&gt; data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
<p>
<h2>Filters</h2>
<p>
Because of the way <code>leveldb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
leveldb::Options options;
- options.filter_policy = NewBloomFilter(10);
+ options.filter_policy = NewBloomFilterPolicy(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
... use the database ...
delete db;
delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
-to NewBloomFilter). This filter will reduce the number of unnecessary
+to NewBloomFilterPolicy). This filter will reduce the number of unnecessary
disk reads needed for <code>Get()</code> calls by a factor of
approximately a 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
-<code>NewBloomFilter</code> must not be used with such a comparator.
+<code>NewBloomFilterPolicy</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
class CustomFilterPolicy : public leveldb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
- CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+ CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) { }
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector&lt;Slice&gt; trimmed(n);
for (int i = 0; i &lt; n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
// Use builtin bloom filter code after removing trailing spaces
return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
}
};
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>leveldb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1>
<p>
<code>leveldb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
may be used to recover as much of the data as possible
<p>
</ul>
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>leveldb</code> on other activities in the system.
<p>
<pre>
class SlowEnv : public leveldb::Env {
.. implementation of the Env interface ...
};
SlowEnv env;
leveldb::Options options;
options.env = &amp;env;
Status s = leveldb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>leveldb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>leveldb::Env</code>
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
<h1>Other Information</h1>
<p>
Details about the <code>leveldb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="impl.html">Implementation notes</a>
<li> <a href="table_format.txt">Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>
</body>
</html>
diff --git a/src/leveldb/doc/log_format.txt b/src/leveldb/doc/log_format.txt
index 3a0414b65..5228f624d 100644
--- a/src/leveldb/doc/log_format.txt
+++ b/src/leveldb/doc/log_format.txt
@@ -1,75 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
- checksum: uint32 // crc32c of type and data[]
- length: uint16
+ checksum: uint32 // crc32c of type and data[] ; little-endian
+ length: uint16 // little-endian
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.
More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MID is the type of all
interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
===================
Some benefits over the recordio format:
(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.
(3) We do not need extra buffering for large records.
Some downsides compared to recordio format:
(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.
(2) No compression. Again, this could be fixed by adding new record types.
diff --git a/src/leveldb/doc/table_format.txt b/src/leveldb/doc/table_format.txt
index d0f3065ed..ca8f9b446 100644
--- a/src/leveldb/doc/table_format.txt
+++ b/src/leveldb/doc/table_format.txt
@@ -1,102 +1,104 @@
File format
===========
<beginning_of_file>
[data block 1]
[data block 2]
...
[data block N]
[meta block 1]
...
[meta block K]
[metaindex block]
[index block]
[Footer] (fixed size; starts at file_size - sizeof(Footer))
<end_of_file>
The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
offset: varint64
size: varint64
+See https://developers.google.com/protocol-buffers/docs/encoding#varints
+for an explanation of varint64 format.
(1) The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in block_builder.cc, and then
optionally compressed.
(2) After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
block_builder.cc and then optionally compressed.
(3) A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.
(4) An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.
(6) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
- metaindex_handle: char[p]; // Block handle for metaindex
- index_handle: char[q]; // Block handle for index
- padding: char[40-p-q]; // 0 bytes to make fixed length
- // (40==2*BlockHandle::kMaxEncodedLength)
- magic: fixed64; // == 0xdb4775248b80fb57
+ metaindex_handle: char[p]; // Block handle for metaindex
+ index_handle: char[q]; // Block handle for index
+ padding: char[40-p-q]; // zeroed bytes to make fixed length
+ // (40==2*BlockHandle::kMaxEncodedLength)
+ magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
"filter" Meta Block
-------------------
If a "FilterPolicy" was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from "filter.<N>" to the BlockHandle for the filter
block where "<N>" is the string returned by the filter policy's
"Name()" method.
The filter block stores a sequence of filters, where filter i contains
the output of FilterPolicy::CreateFilter() on all keys that are stored
in a block whose file offset falls within the range
[ i*base ... (i+1)*base-1 ]
Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
converted to a filter by calling FilterPolicy::CreateFilter(), and the
resulting filter will be stored as the first filter in the filter
block.
The filter block is formatted as follows:
[filter 0]
[filter 1]
[filter 2]
...
[filter N-1]
[offset of filter 0] : 4 bytes
[offset of filter 1] : 4 bytes
[offset of filter 2] : 4 bytes
...
[offset of filter N-1] : 4 bytes
[offset of beginning of offset array] : 4 bytes
lg(base) : 1 byte
The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
"stats" Meta Block
------------------
This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
data size
index size
key size (uncompressed)
value size (uncompressed)
number of entries
number of data blocks
diff --git a/src/leveldb/helpers/memenv/memenv.cc b/src/leveldb/helpers/memenv/memenv.cc
index 2082083b3..5879de121 100644
--- a/src/leveldb/helpers/memenv/memenv.cc
+++ b/src/leveldb/helpers/memenv/memenv.cc
@@ -1,374 +1,384 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include <map>
#include <string.h>
#include <string>
#include <vector>
namespace leveldb {
namespace {
class FileState {
public:
// FileStates are reference counted. The initial reference count is zero
// and the caller must call Ref() at least once.
FileState() : refs_(0), size_(0) {}
// Increase the reference count.
void Ref() {
MutexLock lock(&refs_mutex_);
++refs_;
}
// Decrease the reference count. Delete if this is the last reference.
void Unref() {
bool do_delete = false;
{
MutexLock lock(&refs_mutex_);
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
do_delete = true;
}
}
if (do_delete) {
delete this;
}
}
uint64_t Size() const { return size_; }
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
if (offset > size_) {
return Status::IOError("Offset greater than file size.");
}
const uint64_t available = size_ - offset;
if (n > available) {
n = available;
}
if (n == 0) {
*result = Slice();
return Status::OK();
}
size_t block = offset / kBlockSize;
size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) {
// The requested bytes are all in the first block.
*result = Slice(blocks_[block] + block_offset, n);
return Status::OK();
}
size_t bytes_to_copy = n;
char* dst = scratch;
while (bytes_to_copy > 0) {
size_t avail = kBlockSize - block_offset;
if (avail > bytes_to_copy) {
avail = bytes_to_copy;
}
memcpy(dst, blocks_[block] + block_offset, avail);
bytes_to_copy -= avail;
dst += avail;
block++;
block_offset = 0;
}
*result = Slice(scratch, n);
return Status::OK();
}
Status Append(const Slice& data) {
const char* src = data.data();
size_t src_len = data.size();
while (src_len > 0) {
size_t avail;
size_t offset = size_ % kBlockSize;
if (offset != 0) {
// There is some room in the last block.
avail = kBlockSize - offset;
} else {
// No room in the last block; push new one.
blocks_.push_back(new char[kBlockSize]);
avail = kBlockSize;
}
if (avail > src_len) {
avail = src_len;
}
memcpy(blocks_.back() + offset, src, avail);
src_len -= avail;
src += avail;
size_ += avail;
}
return Status::OK();
}
private:
// Private since only Unref() should be used to delete it.
~FileState() {
for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
++i) {
delete [] *i;
}
}
// No copying allowed.
FileState(const FileState&);
void operator=(const FileState&);
port::Mutex refs_mutex_;
int refs_; // Protected by refs_mutex_;
// The following fields are not protected by any mutex. They are only mutable
// while the file is being written, and concurrent access is not allowed
// to writable files.
std::vector<char*> blocks_;
uint64_t size_;
enum { kBlockSize = 8 * 1024 };
};
class SequentialFileImpl : public SequentialFile {
public:
explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
file_->Ref();
}
~SequentialFileImpl() {
file_->Unref();
}
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s = file_->Read(pos_, n, result, scratch);
if (s.ok()) {
pos_ += result->size();
}
return s;
}
virtual Status Skip(uint64_t n) {
if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()");
}
const size_t available = file_->Size() - pos_;
if (n > available) {
n = available;
}
pos_ += n;
return Status::OK();
}
private:
FileState* file_;
size_t pos_;
};
class RandomAccessFileImpl : public RandomAccessFile {
public:
explicit RandomAccessFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~RandomAccessFileImpl() {
file_->Unref();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return file_->Read(offset, n, result, scratch);
}
private:
FileState* file_;
};
class WritableFileImpl : public WritableFile {
public:
WritableFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~WritableFileImpl() {
file_->Unref();
}
virtual Status Append(const Slice& data) {
return file_->Append(data);
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
private:
FileState* file_;
};
+class NoOpLogger : public Logger {
+ public:
+ virtual void Logv(const char* format, va_list ap) { }
+};
+
class InMemoryEnv : public EnvWrapper {
public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
virtual ~InMemoryEnv() {
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
i->second->Unref();
}
}
// Partial implementation of the Env interface.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new SequentialFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new RandomAccessFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) != file_map_.end()) {
DeleteFileInternal(fname);
}
FileState* file = new FileState();
file->Ref();
file_map_[fname] = file;
*result = new WritableFileImpl(file);
return Status::OK();
}
virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end();
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
MutexLock lock(&mutex_);
result->clear();
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
const std::string& filename = i->first;
if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
Slice(filename).starts_with(Slice(dir))) {
result->push_back(filename.substr(dir.size() + 1));
}
}
return Status::OK();
}
void DeleteFileInternal(const std::string& fname) {
if (file_map_.find(fname) == file_map_.end()) {
return;
}
file_map_[fname]->Unref();
file_map_.erase(fname);
}
virtual Status DeleteFile(const std::string& fname) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
DeleteFileInternal(fname);
return Status::OK();
}
virtual Status CreateDir(const std::string& dirname) {
return Status::OK();
}
virtual Status DeleteDir(const std::string& dirname) {
return Status::OK();
}
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
*file_size = file_map_[fname]->Size();
return Status::OK();
}
virtual Status RenameFile(const std::string& src,
const std::string& target) {
MutexLock lock(&mutex_);
if (file_map_.find(src) == file_map_.end()) {
return Status::IOError(src, "File not found");
}
DeleteFileInternal(target);
file_map_[target] = file_map_[src];
file_map_.erase(src);
return Status::OK();
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = new FileLock;
return Status::OK();
}
virtual Status UnlockFile(FileLock* lock) {
delete lock;
return Status::OK();
}
virtual Status GetTestDirectory(std::string* path) {
*path = "/test";
return Status::OK();
}
+ virtual Status NewLogger(const std::string& fname, Logger** result) {
+ *result = new NoOpLogger;
+ return Status::OK();
+ }
+
private:
// Map from filenames to FileState objects, representing a simple file system.
typedef std::map<std::string, FileState*> FileSystem;
port::Mutex mutex_;
FileSystem file_map_; // Protected by mutex_.
};
} // namespace
Env* NewMemEnv(Env* base_env) {
return new InMemoryEnv(base_env);
}
} // namespace leveldb
diff --git a/src/leveldb/include/leveldb/c.h b/src/leveldb/include/leveldb/c.h
index 70e3cc652..1fa58866c 100644
--- a/src/leveldb/include/leveldb/c.h
+++ b/src/leveldb/include/leveldb/c.h
@@ -1,275 +1,291 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors.
C bindings for leveldb. May be useful as a stable ABI that can be
used by programs that keep leveldb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated c string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
+ (On Windows, *errptr must have been malloc()-ed by this library.)
On success, a leveldb routine leaves *errptr unchanged.
On failure, leveldb frees the old value of *errptr and
set *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_LEVELDB_INCLUDE_C_H_
#define STORAGE_LEVELDB_INCLUDE_C_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */
typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
/* DB operations */
extern leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_close(leveldb_t* db);
extern void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
extern void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options);
extern const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db);
extern void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
leveldb_t* db,
const char* propname);
extern void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */
extern void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
/* Write batch */
extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
leveldb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
leveldb_writebatch_t*,
const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
/* Options */
extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
leveldb_options_t*,
leveldb_comparator_t*);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
enum {
leveldb_no_compression = 0,
leveldb_snappy_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);
/* Comparator */
extern leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t*,
unsigned char);
extern void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t*,
const leveldb_snapshot_t*);
/* Write options */
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t*, unsigned char);
/* Cache */
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
/* Env */
extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
+/* Utility */
+
+/* Calls free(ptr).
+ REQUIRES: ptr was malloc()-ed and returned by one of the routines
+ in this file. Note that in certain cases (typically on Windows), you
+ may need to call this routine instead of free(ptr) to dispose of
+ malloc()-ed memory returned by this library. */
+extern void leveldb_free(void* ptr);
+
+/* Return the major version number for this release. */
+extern int leveldb_major_version();
+
+/* Return the minor version number for this release. */
+extern int leveldb_minor_version();
+
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif /* STORAGE_LEVELDB_INCLUDE_C_H_ */
diff --git a/src/leveldb/include/leveldb/db.h b/src/leveldb/include/leveldb/db.h
index ed56b87c3..79142f5b2 100644
--- a/src/leveldb/include/leveldb/db.h
+++ b/src/leveldb/include/leveldb/db.h
@@ -1,161 +1,161 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
#define STORAGE_LEVELDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include "leveldb/iterator.h"
#include "leveldb/options.h"
namespace leveldb {
// Update Makefile if you change these
static const int kMajorVersion = 1;
-static const int kMinorVersion = 5;
+static const int kMinorVersion = 7;
struct Options;
struct ReadOptions;
struct WriteOptions;
class WriteBatch;
// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range {
Slice start; // Included in the range
Slice limit; // Not included in the range
Range() { }
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores NULL in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
DB() { }
virtual ~DB();
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) = 0;
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
// "leveldb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "leveldb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Compact the underlying storage for the key range [*begin,*end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==NULL is treated as a key before all keys in the database.
// end==NULL is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(NULL, NULL);
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/src/leveldb/include/leveldb/env.h b/src/leveldb/include/leveldb/env.h
index 272066718..fa32289f5 100644
--- a/src/leveldb/include/leveldb/env.h
+++ b/src/leveldb/include/leveldb/env.h
@@ -1,323 +1,333 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the leveldb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <vector>
#include <stdint.h>
#include "leveldb/status.h"
namespace leveldb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to leveldb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores NULL in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) = 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) = 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory.
virtual Status CreateDir(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores NULL in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
// Arrange to run "(*function)(arg)" once in a background thread.
//
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname, Logger** result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower that reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
+
+ private:
+ // No copying allowed
+ SequentialFile(const SequentialFile&);
+ void operator=(const SequentialFile&);
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
+
+ private:
+ // No copying allowed
+ RandomAccessFile(const RandomAccessFile&);
+ void operator=(const RandomAccessFile&);
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() { }
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0;
private:
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// An interface for writing log messages.
class Logger {
public:
Logger() { }
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
// Log the specified data to *info_log if info_log is non-NULL.
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f, SequentialFile** r) {
return target_->NewSequentialFile(f, r);
}
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
return target_->NewRandomAccessFile(f, r);
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
return target_->NewWritableFile(f, r);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a) {
return target_->Schedule(f, a);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
private:
Env* target_;
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/src/leveldb/port/atomic_pointer.h b/src/leveldb/port/atomic_pointer.h
index c58bffbf1..e17bf435e 100644
--- a/src/leveldb/port/atomic_pointer.h
+++ b/src/leveldb/port/atomic_pointer.h
@@ -1,152 +1,224 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// AtomicPointer provides storage for a lock-free pointer.
// Platform-dependent implementation of AtomicPointer:
// - If the platform provides a cheap barrier, we use it with raw pointers
// - If cstdatomic is present (on newer versions of gcc, it is), we use
// a cstdatomic-based AtomicPointer. However we prefer the memory
// barrier based version, because at least on a gcc 4.4 32-bit build
// on linux, we have encountered a buggy <cstdatomic>
// implementation. Also, some <cstdatomic> implementations are much
// slower than a memory-barrier based implementation (~16ns for
// <cstdatomic> based acquire-load vs. ~1ns for a barrier based
// acquire-load).
// This code is based on atomicops-internals-* in Google's perftools:
// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
#ifndef PORT_ATOMIC_POINTER_H_
#define PORT_ATOMIC_POINTER_H_
#include <stdint.h>
#ifdef LEVELDB_CSTDATOMIC_PRESENT
#include <cstdatomic>
#endif
#ifdef OS_WIN
#include <windows.h>
#endif
#ifdef OS_MACOSX
#include <libkern/OSAtomic.h>
#endif
#if defined(_M_X64) || defined(__x86_64__)
#define ARCH_CPU_X86_FAMILY 1
#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
#define ARCH_CPU_X86_FAMILY 1
#elif defined(__ARMEL__)
#define ARCH_CPU_ARM_FAMILY 1
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+#define ARCH_CPU_PPC_FAMILY 1
#endif
namespace leveldb {
namespace port {
// Define MemoryBarrier() if available
// Windows on x86
#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
// windows.h already provides a MemoryBarrier(void) macro
// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
#define LEVELDB_HAVE_MEMORY_BARRIER
// Gcc on x86
#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
inline void MemoryBarrier() {
// See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
// this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
__asm__ __volatile__("" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER
// Sun Studio
#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
inline void MemoryBarrier() {
// See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
// this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
asm volatile("" : : : "memory");
}
#define LEVELDB_HAVE_MEMORY_BARRIER
// Mac OS
#elif defined(OS_MACOSX)
inline void MemoryBarrier() {
OSMemoryBarrier();
}
#define LEVELDB_HAVE_MEMORY_BARRIER
// ARM Linux
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
typedef void (*LinuxKernelMemoryBarrierFunc)(void);
// The Linux ARM kernel provides a highly optimized device-specific memory
// barrier function at a fixed memory address that is mapped in every
// user-level process.
//
// This beats using CPU-specific instructions which are, on single-core
// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
// shows that the extra function call cost is completely negligible on
// multi-core devices.
//
inline void MemoryBarrier() {
(*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
}
#define LEVELDB_HAVE_MEMORY_BARRIER
+// PPC
+#elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__)
+inline void MemoryBarrier() {
+ // TODO for some powerpc expert: is there a cheaper suitable variant?
+ // Perhaps by having separate barriers for acquire and release ops.
+ asm volatile("sync" : : : "memory");
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
#endif
// AtomicPointer built using platform-specific MemoryBarrier()
#if defined(LEVELDB_HAVE_MEMORY_BARRIER)
class AtomicPointer {
private:
void* rep_;
public:
AtomicPointer() { }
explicit AtomicPointer(void* p) : rep_(p) {}
inline void* NoBarrier_Load() const { return rep_; }
inline void NoBarrier_Store(void* v) { rep_ = v; }
inline void* Acquire_Load() const {
void* result = rep_;
MemoryBarrier();
return result;
}
inline void Release_Store(void* v) {
MemoryBarrier();
rep_ = v;
}
};
// AtomicPointer based on <cstdatomic>
#elif defined(LEVELDB_CSTDATOMIC_PRESENT)
class AtomicPointer {
private:
std::atomic<void*> rep_;
public:
AtomicPointer() { }
explicit AtomicPointer(void* v) : rep_(v) { }
inline void* Acquire_Load() const {
return rep_.load(std::memory_order_acquire);
}
inline void Release_Store(void* v) {
rep_.store(v, std::memory_order_release);
}
inline void* NoBarrier_Load() const {
return rep_.load(std::memory_order_relaxed);
}
inline void NoBarrier_Store(void* v) {
rep_.store(v, std::memory_order_relaxed);
}
};
+// Atomic pointer based on sparc memory barriers
+#elif defined(__sparcv9) && defined(__GNUC__)
+class AtomicPointer {
+ private:
+ void* rep_;
+ public:
+ AtomicPointer() { }
+ explicit AtomicPointer(void* v) : rep_(v) { }
+ inline void* Acquire_Load() const {
+ void* val;
+ __asm__ __volatile__ (
+ "ldx [%[rep_]], %[val] \n\t"
+ "membar #LoadLoad|#LoadStore \n\t"
+ : [val] "=r" (val)
+ : [rep_] "r" (&rep_)
+ : "memory");
+ return val;
+ }
+ inline void Release_Store(void* v) {
+ __asm__ __volatile__ (
+ "membar #LoadStore|#StoreStore \n\t"
+ "stx %[v], [%[rep_]] \n\t"
+ :
+ : [rep_] "r" (&rep_), [v] "r" (v)
+ : "memory");
+ }
+ inline void* NoBarrier_Load() const { return rep_; }
+ inline void NoBarrier_Store(void* v) { rep_ = v; }
+};
+
+// Atomic pointer based on ia64 acq/rel
+#elif defined(__ia64) && defined(__GNUC__)
+class AtomicPointer {
+ private:
+ void* rep_;
+ public:
+ AtomicPointer() { }
+ explicit AtomicPointer(void* v) : rep_(v) { }
+ inline void* Acquire_Load() const {
+ void* val ;
+ __asm__ __volatile__ (
+ "ld8.acq %[val] = [%[rep_]] \n\t"
+ : [val] "=r" (val)
+ : [rep_] "r" (&rep_)
+ : "memory"
+ );
+ return val;
+ }
+ inline void Release_Store(void* v) {
+ __asm__ __volatile__ (
+ "st8.rel [%[rep_]] = %[v] \n\t"
+ :
+ : [rep_] "r" (&rep_), [v] "r" (v)
+ : "memory"
+ );
+ }
+ inline void* NoBarrier_Load() const { return rep_; }
+ inline void NoBarrier_Store(void* v) { rep_ = v; }
+};
+
// We have neither MemoryBarrier(), nor <cstdatomic>
#else
#error Please implement AtomicPointer for this platform.
#endif
#undef LEVELDB_HAVE_MEMORY_BARRIER
#undef ARCH_CPU_X86_FAMILY
#undef ARCH_CPU_ARM_FAMILY
+#undef ARCH_CPU_PPC_FAMILY
} // namespace port
} // namespace leveldb
#endif // PORT_ATOMIC_POINTER_H_
diff --git a/src/leveldb/port/port.h b/src/leveldb/port/port.h
index 4baafa8e2..e667db40d 100644
--- a/src/leveldb/port/port.h
+++ b/src/leveldb/port/port.h
@@ -1,21 +1,19 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_PORT_PORT_H_
#define STORAGE_LEVELDB_PORT_PORT_H_
#include <string.h>
// Include the appropriate platform specific file below. If you are
// porting to a new platform, see "port_example.h" for documentation
// of what the new port_<platform>.h file must provide.
#if defined(LEVELDB_PLATFORM_POSIX)
# include "port/port_posix.h"
#elif defined(LEVELDB_PLATFORM_CHROMIUM)
# include "port/port_chromium.h"
-#elif defined(LEVELDB_PLATFORM_WINDOWS)
-# include "port/port_win.h"
#endif
#endif // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/src/leveldb/port/port_win.cc b/src/leveldb/port/port_win.cc
deleted file mode 100644
index 786cd6018..000000000
--- a/src/leveldb/port/port_win.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// See port_example.h for documentation for the following types/functions.
-
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// * Neither the name of the University of California, Berkeley nor the
-// names of its contributors may be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-
-#include "port/port_win.h"
-
-#include <windows.h>
-#include <cassert>
-
-namespace leveldb {
-namespace port {
-
-Mutex::Mutex() :
- mutex_(::CreateMutex(NULL, FALSE, NULL)) {
- assert(mutex_);
-}
-
-Mutex::~Mutex() {
- assert(mutex_);
- ::CloseHandle(mutex_);
-}
-
-void Mutex::Lock() {
- assert(mutex_);
- ::WaitForSingleObject(mutex_, INFINITE);
-}
-
-void Mutex::Unlock() {
- assert(mutex_);
- ::ReleaseMutex(mutex_);
-}
-
-void Mutex::AssertHeld() {
- assert(mutex_);
- assert(1);
-}
-
-CondVar::CondVar(Mutex* mu) :
- waiting_(0),
- mu_(mu),
- sema_(::CreateSemaphore(NULL, 0, 0x7fffffff, NULL)),
- event_(::CreateEvent(NULL, FALSE, FALSE, NULL)),
- broadcasted_(false){
- assert(mu_);
-}
-
-CondVar::~CondVar() {
- ::CloseHandle(sema_);
- ::CloseHandle(event_);
-}
-
-void CondVar::Wait() {
- wait_mtx_.Lock();
- ++waiting_;
- assert(waiting_ > 0);
- wait_mtx_.Unlock();
-
- ::SignalObjectAndWait(mu_->mutex_, sema_, INFINITE, FALSE);
-
- wait_mtx_.Lock();
- bool last = broadcasted_ && (--waiting_ == 0);
- assert(waiting_ >= 0);
- wait_mtx_.Unlock();
-
- // we leave this function with the mutex held
- if (last)
- {
- ::SignalObjectAndWait(event_, mu_->mutex_, INFINITE, FALSE);
- }
- else
- {
- ::WaitForSingleObject(mu_->mutex_, INFINITE);
- }
-}
-
-void CondVar::Signal() {
- wait_mtx_.Lock();
- bool waiters = waiting_ > 0;
- wait_mtx_.Unlock();
-
- if (waiters)
- {
- ::ReleaseSemaphore(sema_, 1, 0);
- }
-}
-
-void CondVar::SignalAll() {
- wait_mtx_.Lock();
-
- broadcasted_ = (waiting_ > 0);
-
- if (broadcasted_)
- {
- // release all
- ::ReleaseSemaphore(sema_, waiting_, 0);
- wait_mtx_.Unlock();
- ::WaitForSingleObject(event_, INFINITE);
- broadcasted_ = false;
- }
- else
- {
- wait_mtx_.Unlock();
- }
-}
-
-AtomicPointer::AtomicPointer(void* v) {
- Release_Store(v);
-}
-
-void* AtomicPointer::Acquire_Load() const {
- void * p = NULL;
- InterlockedExchangePointer(&p, rep_);
- return p;
-}
-
-void AtomicPointer::Release_Store(void* v) {
- InterlockedExchangePointer(&rep_, v);
-}
-
-void* AtomicPointer::NoBarrier_Load() const {
- return rep_;
-}
-
-void AtomicPointer::NoBarrier_Store(void* v) {
- rep_ = v;
-}
-
-enum InitializationState
-{
- Uninitialized = 0,
- Running = 1,
- Initialized = 2
-};
-
-void InitOnce(OnceType* once, void (*initializer)()) {
-
- assert(Uninitialized == LEVELDB_ONCE_INIT);
-
- InitializationState state = static_cast<InitializationState>(InterlockedCompareExchange(once, Running, Uninitialized));
-
- if (state == Uninitialized) {
- initializer();
- *once = Initialized;
- }
-
- if (state == Running) {
- while(*once != Initialized) {
- Sleep(0); // yield
- }
- }
-
- assert(*once == Initialized);
-}
-
-}
-}
diff --git a/src/leveldb/port/port_win.h b/src/leveldb/port/port_win.h
deleted file mode 100644
index 893919998..000000000
--- a/src/leveldb/port/port_win.h
+++ /dev/null
@@ -1,161 +0,0 @@
-// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// See port_example.h for documentation for the following types/functions.
-
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// * Neither the name of the University of California, Berkeley nor the
-// names of its contributors may be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-
-#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_
-#define STORAGE_LEVELDB_PORT_PORT_WIN_H_
-
-#ifdef _MSC_VER
-#define snprintf _snprintf
-#define close _close
-#define fread_unlocked _fread_nolock
-#endif
-
-
-#ifdef SNAPPY
-#include <snappy/snappy.h>
-#endif
-
-#include <string>
-
-#include <stdint.h>
-
-namespace leveldb {
-namespace port {
-
-// Windows is little endian (for now :p)
-static const bool kLittleEndian = true;
-
-class CondVar;
-
-class Mutex {
- public:
- Mutex();
- ~Mutex();
-
- void Lock();
- void Unlock();
- void AssertHeld();
-
- private:
- friend class CondVar;
- // critical sections are more efficient than mutexes
- // but they are not recursive and can only be used to synchronize threads within the same process
- // additionnaly they cannot be used with SignalObjectAndWait that we use for CondVar
- // we use opaque void * to avoid including windows.h in port_win.h
- void * mutex_;
-
- // No copying
- Mutex(const Mutex&);
- void operator=(const Mutex&);
-};
-
-// the Win32 API offers a dependable condition variable mechanism, but only starting with
-// Windows 2008 and Vista
-// no matter what we will implement our own condition variable with a semaphore
-// implementation as described in a paper written by Douglas C. Schmidt and Irfan Pyarali
-class CondVar {
- public:
- explicit CondVar(Mutex* mu);
- ~CondVar();
- void Wait();
- void Signal();
- void SignalAll();
- private:
- Mutex* mu_;
-
- Mutex wait_mtx_;
- long waiting_;
-
- void * sema_;
- void * event_;
-
- bool broadcasted_;
-};
-
-// Storage for a lock-free pointer
-class AtomicPointer {
- private:
- void * rep_;
- public:
- AtomicPointer() : rep_(NULL) { }
- explicit AtomicPointer(void* v);
- void* Acquire_Load() const;
-
- void Release_Store(void* v);
-
- void* NoBarrier_Load() const;
-
- void NoBarrier_Store(void* v);
-};
-
-typedef volatile long OnceType;
-#define LEVELDB_ONCE_INIT (0)
-
-extern void InitOnce(OnceType* once, void (*initializer)());
-
-inline bool Snappy_Compress(const char* input, size_t length,
- ::std::string* output) {
-#ifdef SNAPPY
- output->resize(snappy::MaxCompressedLength(length));
- size_t outlen;
- snappy::RawCompress(input, length, &(*output)[0], &outlen);
- output->resize(outlen);
- return true;
-#endif
-
- return false;
-}
-
-inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
- size_t* result) {
-#ifdef SNAPPY
- return snappy::GetUncompressedLength(input, length, result);
-#else
- return false;
-#endif
-}
-
-inline bool Snappy_Uncompress(const char* input, size_t length,
- char* output) {
-#ifdef SNAPPY
- return snappy::RawUncompress(input, length, output);
-#else
- return false;
-#endif
-}
-
-inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
- return false;
-}
-
-}
-}
-
-#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_
diff --git a/src/leveldb/port/thread_annotations.h b/src/leveldb/port/thread_annotations.h
new file mode 100644
index 000000000..6f9b6a792
--- /dev/null
+++ b/src/leveldb/port/thread_annotations.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
+
+// Some environments provide custom macros to aid in static thread-safety
+// analysis. Provide empty definitions of such macros unless they are already
+// defined.
+
+#ifndef EXCLUSIVE_LOCKS_REQUIRED
+#define EXCLUSIVE_LOCKS_REQUIRED(...)
+#endif
+
+#ifndef SHARED_LOCKS_REQUIRED
+#define SHARED_LOCKS_REQUIRED(...)
+#endif
+
+#ifndef LOCKS_EXCLUDED
+#define LOCKS_EXCLUDED(...)
+#endif
+
+#ifndef LOCK_RETURNED
+#define LOCK_RETURNED(x)
+#endif
+
+#ifndef LOCKABLE
+#define LOCKABLE
+#endif
+
+#ifndef SCOPED_LOCKABLE
+#define SCOPED_LOCKABLE
+#endif
+
+#ifndef EXCLUSIVE_LOCK_FUNCTION
+#define EXCLUSIVE_LOCK_FUNCTION(...)
+#endif
+
+#ifndef SHARED_LOCK_FUNCTION
+#define SHARED_LOCK_FUNCTION(...)
+#endif
+
+#ifndef EXCLUSIVE_TRYLOCK_FUNCTION
+#define EXCLUSIVE_TRYLOCK_FUNCTION(...)
+#endif
+
+#ifndef SHARED_TRYLOCK_FUNCTION
+#define SHARED_TRYLOCK_FUNCTION(...)
+#endif
+
+#ifndef UNLOCK_FUNCTION
+#define UNLOCK_FUNCTION(...)
+#endif
+
+#ifndef NO_THREAD_SAFETY_ANALYSIS
+#define NO_THREAD_SAFETY_ANALYSIS
+#endif
+
+#endif // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
diff --git a/src/leveldb/table/block.cc b/src/leveldb/table/block.cc
index 199d45377..ab83c1112 100644
--- a/src/leveldb/table/block.cc
+++ b/src/leveldb/table/block.cc
@@ -1,267 +1,267 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.
#include "table/block.h"
#include <vector>
#include <algorithm>
#include "leveldb/comparator.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
inline uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
Block::Block(const BlockContents& contents)
: data_(contents.data.data()),
size_(contents.data.size()),
owned_(contents.heap_allocated) {
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
if (restart_offset_ > size_ - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size_ = 0;
}
}
}
Block::~Block() {
if (owned_) {
delete[] data_;
}
}
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not derefence past "limit".
//
// If any errors are detected, returns NULL. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
uint32_t* shared,
uint32_t* non_shared,
uint32_t* value_length) {
if (limit - p < 3) return NULL;
*shared = reinterpret_cast<const unsigned char*>(p)[0];
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
if ((*shared | *non_shared | *value_length) < 128) {
// Fast path: all three values are encoded in one byte each
p += 3;
} else {
if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
}
if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
return NULL;
}
return p;
}
class Block::Iter : public Iterator {
private:
const Comparator* const comparator_;
const char* const data_; // underlying block contents
uint32_t const restarts_; // Offset of restart array (list of fixed32)
uint32_t const num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
uint32_t restart_index_; // Index of restart block in which current_ falls
std::string key_;
Slice value_;
Status status_;
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_;
}
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
void SeekToRestartPoint(uint32_t index) {
key_.clear();
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly
uint32_t offset = GetRestartPoint(index);
value_ = Slice(data_ + offset, 0);
}
public:
Iter(const Comparator* comparator,
const char* data,
uint32_t restarts,
uint32_t num_restarts)
: comparator_(comparator),
data_(data),
restarts_(restarts),
num_restarts_(num_restarts),
current_(restarts_),
restart_index_(num_restarts_) {
assert(num_restarts_ > 0);
}
virtual bool Valid() const { return current_ < restarts_; }
virtual Status status() const { return status_; }
virtual Slice key() const {
assert(Valid());
return key_;
}
virtual Slice value() const {
assert(Valid());
return value_;
}
virtual void Next() {
assert(Valid());
ParseNextKey();
}
virtual void Prev() {
assert(Valid());
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
SeekToRestartPoint(restart_index_);
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original);
}
virtual void Seek(const Slice& target) {
- // Binary search in restart array to find the first restart point
- // with a key >= target
+ // Binary search in restart array to find the last restart point
+ // with a key < target
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset,
data_ + restarts_,
&shared, &non_shared, &value_length);
if (key_ptr == NULL || (shared != 0)) {
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
// Linear search (within restart block) for first key >= target
SeekToRestartPoint(left);
while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target) >= 0) {
return;
}
}
}
virtual void SeekToFirst() {
SeekToRestartPoint(0);
ParseNextKey();
}
virtual void SeekToLast() {
SeekToRestartPoint(num_restarts_ - 1);
while (ParseNextKey() && NextEntryOffset() < restarts_) {
// Keep skipping
}
}
private:
void CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
key_.clear();
value_.clear();
}
bool ParseNextKey() {
current_ = NextEntryOffset();
const char* p = data_ + current_;
const char* limit = data_ + restarts_; // Restarts come right after data
if (p >= limit) {
// No more entries to return. Mark as invalid.
current_ = restarts_;
restart_index_ = num_restarts_;
return false;
}
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
if (p == NULL || key_.size() < shared) {
CorruptionError();
return false;
} else {
key_.resize(shared);
key_.append(p, non_shared);
value_ = Slice(p + non_shared, value_length);
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
return true;
}
}
};
Iterator* Block::NewIterator(const Comparator* cmp) {
if (size_ < 2*sizeof(uint32_t)) {
return NewErrorIterator(Status::Corruption("bad block contents"));
}
const uint32_t num_restarts = NumRestarts();
if (num_restarts == 0) {
return NewEmptyIterator();
} else {
return new Iter(cmp, data_, restart_offset_, num_restarts);
}
}
} // namespace leveldb
diff --git a/src/leveldb/util/bloom_test.cc b/src/leveldb/util/bloom_test.cc
index 4a6ea1b7c..0bf8e8d6e 100644
--- a/src/leveldb/util/bloom_test.cc
+++ b/src/leveldb/util/bloom_test.cc
@@ -1,159 +1,160 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/filter_policy.h"
+#include "util/coding.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kVerbose = 1;
static Slice Key(int i, char* buffer) {
- memcpy(buffer, &i, sizeof(i));
- return Slice(buffer, sizeof(i));
+ EncodeFixed32(buffer, i);
+ return Slice(buffer, sizeof(uint32_t));
}
class BloomTest {
private:
const FilterPolicy* policy_;
std::string filter_;
std::vector<std::string> keys_;
public:
BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
~BloomTest() {
delete policy_;
}
void Reset() {
keys_.clear();
filter_.clear();
}
void Add(const Slice& s) {
keys_.push_back(s.ToString());
}
void Build() {
std::vector<Slice> key_slices;
for (size_t i = 0; i < keys_.size(); i++) {
key_slices.push_back(Slice(keys_[i]));
}
filter_.clear();
policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
keys_.clear();
if (kVerbose >= 2) DumpFilter();
}
size_t FilterSize() const {
return filter_.size();
}
void DumpFilter() {
fprintf(stderr, "F(");
for (size_t i = 0; i+1 < filter_.size(); i++) {
const unsigned int c = static_cast<unsigned int>(filter_[i]);
for (int j = 0; j < 8; j++) {
fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
}
}
fprintf(stderr, ")\n");
}
bool Matches(const Slice& s) {
if (!keys_.empty()) {
Build();
}
return policy_->KeyMayMatch(s, filter_);
}
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
}
};
TEST(BloomTest, EmptyFilter) {
ASSERT_TRUE(! Matches("hello"));
ASSERT_TRUE(! Matches("world"));
}
TEST(BloomTest, Small) {
Add("hello");
Add("world");
ASSERT_TRUE(Matches("hello"));
ASSERT_TRUE(Matches("world"));
ASSERT_TRUE(! Matches("x"));
ASSERT_TRUE(! Matches("foo"));
}
static int NextLength(int length) {
if (length < 10) {
length += 1;
} else if (length < 100) {
length += 10;
} else if (length < 1000) {
length += 100;
} else {
length += 1000;
}
return length;
}
TEST(BloomTest, VaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
for (int length = 1; length <= 10000; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
}
Build();
ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
double rate = FalsePositiveRate();
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
}
ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often
else good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
// Different bits-per-byte
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}
diff --git a/src/leveldb/util/coding.cc b/src/leveldb/util/coding.cc
index dbd7a6545..21e3186d5 100644
--- a/src/leveldb/util/coding.cc
+++ b/src/leveldb/util/coding.cc
@@ -1,194 +1,194 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/coding.h"
namespace leveldb {
void EncodeFixed32(char* buf, uint32_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- memcpy(buf, &value, sizeof(value));
-#else
- buf[0] = value & 0xff;
- buf[1] = (value >> 8) & 0xff;
- buf[2] = (value >> 16) & 0xff;
- buf[3] = (value >> 24) & 0xff;
-#endif
+ if (port::kLittleEndian) {
+ memcpy(buf, &value, sizeof(value));
+ } else {
+ buf[0] = value & 0xff;
+ buf[1] = (value >> 8) & 0xff;
+ buf[2] = (value >> 16) & 0xff;
+ buf[3] = (value >> 24) & 0xff;
+ }
}
void EncodeFixed64(char* buf, uint64_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- memcpy(buf, &value, sizeof(value));
-#else
- buf[0] = value & 0xff;
- buf[1] = (value >> 8) & 0xff;
- buf[2] = (value >> 16) & 0xff;
- buf[3] = (value >> 24) & 0xff;
- buf[4] = (value >> 32) & 0xff;
- buf[5] = (value >> 40) & 0xff;
- buf[6] = (value >> 48) & 0xff;
- buf[7] = (value >> 56) & 0xff;
-#endif
+ if (port::kLittleEndian) {
+ memcpy(buf, &value, sizeof(value));
+ } else {
+ buf[0] = value & 0xff;
+ buf[1] = (value >> 8) & 0xff;
+ buf[2] = (value >> 16) & 0xff;
+ buf[3] = (value >> 24) & 0xff;
+ buf[4] = (value >> 32) & 0xff;
+ buf[5] = (value >> 40) & 0xff;
+ buf[6] = (value >> 48) & 0xff;
+ buf[7] = (value >> 56) & 0xff;
+ }
}
void PutFixed32(std::string* dst, uint32_t value) {
char buf[sizeof(value)];
EncodeFixed32(buf, value);
dst->append(buf, sizeof(buf));
}
void PutFixed64(std::string* dst, uint64_t value) {
char buf[sizeof(value)];
EncodeFixed64(buf, value);
dst->append(buf, sizeof(buf));
}
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1<<7)) {
*(ptr++) = v;
} else if (v < (1<<14)) {
*(ptr++) = v | B;
*(ptr++) = v>>7;
} else if (v < (1<<21)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = v>>14;
} else if (v < (1<<28)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = v>>21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = (v>>21) | B;
*(ptr++) = v>>28;
}
return reinterpret_cast<char*>(ptr);
}
void PutVarint32(std::string* dst, uint32_t v) {
char buf[5];
char* ptr = EncodeVarint32(buf, v);
dst->append(buf, ptr - buf);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B-1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
void PutVarint64(std::string* dst, uint64_t v) {
char buf[10];
char* ptr = EncodeVarint64(buf, v);
dst->append(buf, ptr - buf);
}
void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
PutVarint32(dst, value.size());
dst->append(value.data(), value.size());
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
const char* GetVarint32PtrFallback(const char* p,
const char* limit,
uint32_t* value) {
uint32_t result = 0;
for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
p++;
if (byte & 128) {
// More bytes are present
result |= ((byte & 127) << shift);
} else {
result |= (byte << shift);
*value = result;
return reinterpret_cast<const char*>(p);
}
}
return NULL;
}
bool GetVarint32(Slice* input, uint32_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint32Ptr(p, limit, value);
if (q == NULL) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
uint64_t result = 0;
for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
p++;
if (byte & 128) {
// More bytes are present
result |= ((byte & 127) << shift);
} else {
result |= (byte << shift);
*value = result;
return reinterpret_cast<const char*>(p);
}
}
return NULL;
}
bool GetVarint64(Slice* input, uint64_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint64Ptr(p, limit, value);
if (q == NULL) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetLengthPrefixedSlice(const char* p, const char* limit,
Slice* result) {
uint32_t len;
p = GetVarint32Ptr(p, limit, &len);
if (p == NULL) return NULL;
if (p + len > limit) return NULL;
*result = Slice(p, len);
return p + len;
}
bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
uint32_t len;
if (GetVarint32(input, &len) &&
input->size() >= len) {
*result = Slice(input->data(), len);
input->remove_prefix(len);
return true;
} else {
return false;
}
}
} // namespace leveldb
diff --git a/src/leveldb/util/env_boost.cc b/src/leveldb/util/env_boost.cc
deleted file mode 100644
index 055c65743..000000000
--- a/src/leveldb/util/env_boost.cc
+++ /dev/null
@@ -1,591 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include <deque>
-
-#ifdef LEVELDB_PLATFORM_WINDOWS
-#include <windows.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <io.h>
-#else
-#include <dirent.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/param.h>
-#include <time.h>
-#include <unistd.h>
-#endif
-#if defined(LEVELDB_PLATFORM_ANDROID)
-#include <sys/stat.h>
-#endif
-#include "leveldb/env.h"
-#include "leveldb/slice.h"
-
-#ifdef LEVELDB_PLATFORM_WINDOWS
-#include "util/win_logger.h"
-#else
-#include "util/posix_logger.h"
-#endif
-#include "port/port.h"
-#include "util/logging.h"
-
-#ifdef __linux
-#include <sys/sysinfo.h>
-#include <linux/unistd.h>
-#endif
-
-#include <fstream>
-
-// Boost includes - see WINDOWS file to see which modules to install
-#include <boost/date_time/gregorian/gregorian.hpp>
-#include <boost/date_time/posix_time/posix_time.hpp>
-#include <boost/filesystem/convenience.hpp>
-#include <boost/thread/once.hpp>
-#include <boost/thread/thread.hpp>
-#include <boost/bind.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <boost/interprocess/sync/file_lock.hpp>
-#include <boost/thread/condition_variable.hpp>
-
-namespace leveldb {
-namespace {
-
-// returns the ID of the current process
-static boost::uint32_t current_process_id(void) {
-#ifdef _WIN32
- return static_cast<boost::uint32_t>(::GetCurrentProcessId());
-#else
- return static_cast<boost::uint32_t>(::getpid());
-#endif
-}
-
-// returns the ID of the current thread
-static boost::uint32_t current_thread_id(void) {
-#ifdef _WIN32
- return static_cast<boost::uint32_t>(::GetCurrentThreadId());
-#else
-#ifdef __linux
- return static_cast<boost::uint32_t>(::syscall(__NR_gettid));
-#else
- // just return the pid
- return current_process_id();
-#endif
-#endif
-}
-
-static char global_read_only_buf[0x8000];
-
-class PosixSequentialFile: public SequentialFile {
- private:
- std::string filename_;
- FILE* file_;
-
- public:
- PosixSequentialFile(const std::string& fname, FILE* f)
- : filename_(fname), file_(f) { }
- virtual ~PosixSequentialFile() { fclose(file_); }
-
- virtual Status Read(size_t n, Slice* result, char* scratch) {
- Status s;
-#if defined(BSD) || defined(__MINGW32__)
- // fread_unlocked doesn't exist on FreeBSD or MingW
- size_t r = fread(scratch, 1, n, file_);
-#else
- size_t r = fread_unlocked(scratch, 1, n, file_);
-#endif
- *result = Slice(scratch, r);
- if (r < n) {
- if (feof(file_)) {
- // We leave status as ok if we hit the end of the file
- } else {
- // A partial read with an error: return a non-ok status
- s = Status::IOError(filename_, strerror(errno));
- }
- }
- return s;
- }
-
- virtual Status Skip(uint64_t n) {
- if (fseek(file_, n, SEEK_CUR)) {
- return Status::IOError(filename_, strerror(errno));
- }
- return Status::OK();
- }
-};
-
-class PosixRandomAccessFile: public RandomAccessFile {
- private:
- std::string filename_;
- int fd_;
- mutable boost::mutex mu_;
-
- public:
- PosixRandomAccessFile(const std::string& fname, int fd)
- : filename_(fname), fd_(fd) { }
- virtual ~PosixRandomAccessFile() { close(fd_); }
-
- virtual Status Read(uint64_t offset, size_t n, Slice* result,
- char* scratch) const {
- Status s;
-#ifdef LEVELDB_PLATFORM_WINDOWS
- // no pread on Windows so we emulate it with a mutex
- boost::unique_lock<boost::mutex> lock(mu_);
-
- if (::_lseeki64(fd_, offset, SEEK_SET) == -1L) {
- return Status::IOError(filename_, strerror(errno));
- }
-
- int r = ::_read(fd_, scratch, n);
- *result = Slice(scratch, (r < 0) ? 0 : r);
- lock.unlock();
-#else
- ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
- *result = Slice(scratch, (r < 0) ? 0 : r);
-#endif
- if (r < 0) {
- // An error: return a non-ok status
- s = Status::IOError(filename_, strerror(errno));
- }
- return s;
- }
-};
-
-// We preallocate up to an extra megabyte and use memcpy to append new
-// data to the file. This is safe since we either properly close the
-// file before reading from it, or for log files, the reading code
-// knows enough to skip zero suffixes.
-
-class BoostFile : public WritableFile {
-
-public:
- explicit BoostFile(std::string path) : path_(path), written_(0) {
- Open();
- }
-
- virtual ~BoostFile() {
- Close();
- }
-
-private:
- void Open() {
- // we truncate the file as implemented in env_posix
- file_.open(path_.generic_string().c_str(),
- std::ios_base::trunc | std::ios_base::out | std::ios_base::binary);
- written_ = 0;
- }
-
-public:
- virtual Status Append(const Slice& data) {
- Status result;
- file_.write(data.data(), data.size());
- if (!file_.good()) {
- result = Status::IOError(
- path_.generic_string() + " Append", "cannot write");
- }
- return result;
- }
-
- virtual Status Close() {
- Status result;
-
- try {
- if (file_.is_open()) {
- Sync();
- file_.close();
- }
- } catch (const std::exception & e) {
- result = Status::IOError(path_.generic_string() + " close", e.what());
- }
-
- return result;
- }
-
- virtual Status Flush() {
- file_.flush();
- return Status::OK();
- }
-
- virtual Status Sync() {
- Status result;
- try {
- Flush();
- } catch (const std::exception & e) {
- result = Status::IOError(path_.string() + " sync", e.what());
- }
-
- return result;
- }
-
-private:
- boost::filesystem::path path_;
- boost::uint64_t written_;
- std::ofstream file_;
-};
-
-
-
-class BoostFileLock : public FileLock {
- public:
- boost::interprocess::file_lock fl_;
-};
-
-class PosixEnv : public Env {
- public:
- PosixEnv();
- virtual ~PosixEnv() {
- fprintf(stderr, "Destroying Env::Default()\n");
- exit(1);
- }
-
- virtual Status NewSequentialFile(const std::string& fname,
- SequentialFile** result) {
- FILE* f = fopen(fname.c_str(), "rb");
- if (f == NULL) {
- *result = NULL;
- return Status::IOError(fname, strerror(errno));
- } else {
- *result = new PosixSequentialFile(fname, f);
- return Status::OK();
- }
- }
-
- virtual Status NewRandomAccessFile(const std::string& fname,
- RandomAccessFile** result) {
-#ifdef LEVELDB_PLATFORM_WINDOWS
- int fd = _open(fname.c_str(), _O_RDONLY | _O_RANDOM | _O_BINARY);
-#else
- int fd = open(fname.c_str(), O_RDONLY);
-#endif
- if (fd < 0) {
- *result = NULL;
- return Status::IOError(fname, strerror(errno));
- }
- *result = new PosixRandomAccessFile(fname, fd);
- return Status::OK();
- }
-
- virtual Status NewWritableFile(const std::string& fname,
- WritableFile** result) {
- Status s;
- try {
- // will create a new empty file to write to
- *result = new BoostFile(fname);
- }
- catch (const std::exception & e) {
- s = Status::IOError(fname, e.what());
- }
-
- return s;
- }
-
- virtual bool FileExists(const std::string& fname) {
- return boost::filesystem::exists(fname);
- }
-
- virtual Status GetChildren(const std::string& dir,
- std::vector<std::string>* result) {
- result->clear();
-
- boost::system::error_code ec;
- boost::filesystem::directory_iterator current(dir, ec);
- if (ec != 0) {
- return Status::IOError(dir, ec.message());
- }
-
- boost::filesystem::directory_iterator end;
-
- for(; current != end; ++current) {
- result->push_back(current->path().filename().generic_string());
- }
-
- return Status::OK();
- }
-
- virtual Status DeleteFile(const std::string& fname) {
- boost::system::error_code ec;
-
- boost::filesystem::remove(fname, ec);
-
- Status result;
-
- if (ec != 0) {
- result = Status::IOError(fname, ec.message());
- }
-
- return result;
- }
-
- virtual Status CreateDir(const std::string& name) {
- Status result;
-
- if (boost::filesystem::exists(name) &&
- boost::filesystem::is_directory(name)) {
- return result;
- }
-
- boost::system::error_code ec;
-
- if (!boost::filesystem::create_directories(name, ec)) {
- result = Status::IOError(name, ec.message());
- }
-
- return result;
- };
-
- virtual Status DeleteDir(const std::string& name) {
- Status result;
-
- boost::system::error_code ec;
- if (!boost::filesystem::remove_all(name, ec)) {
- result = Status::IOError(name, ec.message());
- }
-
- return result;
- };
-
- virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
- boost::system::error_code ec;
-
- Status result;
-
- *size = static_cast<uint64_t>(boost::filesystem::file_size(fname, ec));
- if (ec != 0) {
- *size = 0;
- result = Status::IOError(fname, ec.message());
- }
-
- return result;
- }
-
- virtual Status RenameFile(const std::string& src, const std::string& target) {
- boost::system::error_code ec;
-
- boost::filesystem::rename(src, target, ec);
-
- Status result;
-
- if (ec != 0) {
- result = Status::IOError(src, ec.message());
- }
-
- return result;
- }
-
- virtual Status LockFile(const std::string& fname, FileLock** lock) {
- *lock = NULL;
-
- Status result;
-
- try {
- if (!boost::filesystem::exists(fname)) {
- std::ofstream of(fname.c_str(), std::ios_base::trunc | std::ios_base::out);
- }
-
- assert(boost::filesystem::exists(fname));
-
- boost::interprocess::file_lock fl(fname.c_str());
- BoostFileLock * my_lock = new BoostFileLock();
- fl.swap(my_lock->fl_);
- if (!my_lock->fl_.try_lock()) {
- return Status::IOError("database already in use: could not acquire exclusive lock");
- }
- *lock = my_lock;
- } catch (const std::exception & e) {
- result = Status::IOError("lock " + fname, e.what());
- }
-
- return result;
- }
-
- virtual Status UnlockFile(FileLock* lock) {
-
- Status result;
-
- try {
- BoostFileLock * my_lock = static_cast<BoostFileLock *>(lock);
- my_lock->fl_.unlock();
- delete my_lock;
- } catch (const std::exception & e) {
- result = Status::IOError("unlock", e.what());
- }
-
- return result;
- }
-
- virtual void Schedule(void (*function)(void*), void* arg);
-
- virtual void StartThread(void (*function)(void* arg), void* arg);
-
- virtual Status GetTestDirectory(std::string* result) {
- boost::system::error_code ec;
- boost::filesystem::path temp_dir =
- boost::filesystem::temp_directory_path(ec);
- if (ec != 0) {
- temp_dir = "tmp";
- }
-
- temp_dir /= "leveldb_tests";
- temp_dir /= boost::lexical_cast<std::string>(current_process_id());
-
- // Directory may already exist
- CreateDir(temp_dir.generic_string());
-
- *result = temp_dir.generic_string();
-
- return Status::OK();
- }
-
-#ifndef LEVELDB_PLATFORM_WINDOWS
- static uint64_t gettid() {
- pthread_t tid = pthread_self();
- uint64_t thread_id = 0;
- memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
- return thread_id;
- }
-#endif
-
- virtual Status NewLogger(const std::string& fname, Logger** result) {
- FILE* f = fopen(fname.c_str(), "wt");
- if (f == NULL) {
- *result = NULL;
- return Status::IOError(fname, strerror(errno));
- } else {
-#ifdef LEVELDB_PLATFORM_WINDOWS
- *result = new WinLogger(f);
-#else
- *result = new PosixLogger(f, &PosixEnv::gettid);
-#endif
- return Status::OK();
- }
- }
-
- virtual uint64_t NowMicros() {
- return static_cast<uint64_t>(
- boost::posix_time::microsec_clock::universal_time()
- .time_of_day().total_microseconds());
- }
-
- virtual void SleepForMicroseconds(int micros) {
- boost::this_thread::sleep(boost::posix_time::microseconds(micros));
- }
-
- private:
- void PthreadCall(const char* label, int result) {
- if (result != 0) {
- fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
- exit(1);
- }
- }
-
- // BGThread() is the body of the background thread
- void BGThread();
-
- static void* BGThreadWrapper(void* arg) {
- reinterpret_cast<PosixEnv*>(arg)->BGThread();
- return NULL;
- }
-
- boost::mutex mu_;
- boost::condition_variable bgsignal_;
- boost::scoped_ptr<boost::thread> bgthread_;
-
- // Entry per Schedule() call
- struct BGItem { void* arg; void (*function)(void*); };
- typedef std::deque<BGItem> BGQueue;
- BGQueue queue_;
-};
-
-PosixEnv::PosixEnv() { }
-
-void PosixEnv::Schedule(void (*function)(void*), void* arg) {
- boost::unique_lock<boost::mutex> lock(mu_);
-
- // Start background thread if necessary
- if (!bgthread_) {
- bgthread_.reset(
- new boost::thread(boost::bind(&PosixEnv::BGThreadWrapper, this)));
- }
-
- // Add to priority queue
- queue_.push_back(BGItem());
- queue_.back().function = function;
- queue_.back().arg = arg;
-
- lock.unlock();
-
- bgsignal_.notify_one();
-
-}
-
-void PosixEnv::BGThread() {
- while (true) {
- // Wait until there is an item that is ready to run
- boost::unique_lock<boost::mutex> lock(mu_);
-
- while (queue_.empty()) {
- bgsignal_.wait(lock);
- }
-
- void (*function)(void*) = queue_.front().function;
- void* arg = queue_.front().arg;
- queue_.pop_front();
-
- lock.unlock();
- (*function)(arg);
- }
-}
-
-namespace {
-struct StartThreadState {
- void (*user_function)(void*);
- void* arg;
-};
-}
-
-static void* StartThreadWrapper(void* arg) {
- StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
- state->user_function(state->arg);
- delete state;
- return NULL;
-}
-
-void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
- StartThreadState* state = new StartThreadState;
- state->user_function = function;
- state->arg = arg;
-
- boost::thread t(boost::bind(&StartThreadWrapper, state));
-}
-
-}
-
-static boost::once_flag once = BOOST_ONCE_INIT;
-static Env* default_env;
-static void InitDefaultEnv() {
- ::memset(global_read_only_buf, 0, sizeof(global_read_only_buf));
- default_env = new PosixEnv;
-}
-
-Env* Env::Default() {
- boost::call_once(once, InitDefaultEnv);
-
- return default_env;
-}
-
-}
diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc
index cb1f6fc05..78e09c95c 100644
--- a/src/leveldb/util/env_posix.cc
+++ b/src/leveldb/util/env_posix.cc
@@ -1,609 +1,698 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <deque>
+#include <set>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#if defined(LEVELDB_PLATFORM_ANDROID)
#include <sys/stat.h>
#endif
#include "leveldb/env.h"
#include "leveldb/slice.h"
#include "port/port.h"
#include "util/logging.h"
+#include "util/mutexlock.h"
#include "util/posix_logger.h"
namespace leveldb {
namespace {
static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
class PosixSequentialFile: public SequentialFile {
private:
std::string filename_;
FILE* file_;
public:
PosixSequentialFile(const std::string& fname, FILE* f)
: filename_(fname), file_(f) { }
virtual ~PosixSequentialFile() { fclose(file_); }
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s;
size_t r = fread_unlocked(scratch, 1, n, file_);
*result = Slice(scratch, r);
if (r < n) {
if (feof(file_)) {
// We leave status as ok if we hit the end of the file
} else {
// A partial read with an error: return a non-ok status
s = IOError(filename_, errno);
}
}
return s;
}
virtual Status Skip(uint64_t n) {
if (fseek(file_, n, SEEK_CUR)) {
return IOError(filename_, errno);
}
return Status::OK();
}
};
// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
private:
std::string filename_;
int fd_;
public:
PosixRandomAccessFile(const std::string& fname, int fd)
: filename_(fname), fd_(fd) { }
virtual ~PosixRandomAccessFile() { close(fd_); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
*result = Slice(scratch, (r < 0) ? 0 : r);
if (r < 0) {
// An error: return a non-ok status
s = IOError(filename_, errno);
}
return s;
}
};
+// Helper class to limit mmap file usage so that we do not end up
+// running out virtual memory or running into kernel performance
+// problems for very large databases.
+class MmapLimiter {
+ public:
+ // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
+ MmapLimiter() {
+ SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
+ }
+
+ // If another mmap slot is available, acquire it and return true.
+ // Else return false.
+ bool Acquire() {
+ if (GetAllowed() <= 0) {
+ return false;
+ }
+ MutexLock l(&mu_);
+ intptr_t x = GetAllowed();
+ if (x <= 0) {
+ return false;
+ } else {
+ SetAllowed(x - 1);
+ return true;
+ }
+ }
+
+ // Release a slot acquired by a previous call to Acquire() that returned true.
+ void Release() {
+ MutexLock l(&mu_);
+ SetAllowed(GetAllowed() + 1);
+ }
+
+ private:
+ port::Mutex mu_;
+ port::AtomicPointer allowed_;
+
+ intptr_t GetAllowed() const {
+ return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
+ }
+
+ // REQUIRES: mu_ must be held
+ void SetAllowed(intptr_t v) {
+ allowed_.Release_Store(reinterpret_cast<void*>(v));
+ }
+
+ MmapLimiter(const MmapLimiter&);
+ void operator=(const MmapLimiter&);
+};
+
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
private:
std::string filename_;
void* mmapped_region_;
size_t length_;
+ MmapLimiter* limiter_;
public:
// base[0,length-1] contains the mmapped contents of the file.
- PosixMmapReadableFile(const std::string& fname, void* base, size_t length)
- : filename_(fname), mmapped_region_(base), length_(length) { }
- virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
+ PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
+ MmapLimiter* limiter)
+ : filename_(fname), mmapped_region_(base), length_(length),
+ limiter_(limiter) {
+ }
+
+ virtual ~PosixMmapReadableFile() {
+ munmap(mmapped_region_, length_);
+ limiter_->Release();
+ }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
if (offset + n > length_) {
*result = Slice();
s = IOError(filename_, EINVAL);
} else {
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
}
return s;
}
};
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file. This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
private:
std::string filename_;
int fd_;
size_t page_size_;
size_t map_size_; // How much extra memory to map at a time
char* base_; // The mapped region
char* limit_; // Limit of the mapped region
char* dst_; // Where to write next (in range [base_,limit_])
char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of base_ in file
// Have we done an munmap of unsynced data?
bool pending_sync_;
// Roundup x to a multiple of y
static size_t Roundup(size_t x, size_t y) {
return ((x + y - 1) / y) * y;
}
size_t TruncateToPageBoundary(size_t s) {
s -= (s & (page_size_ - 1));
assert((s % page_size_) == 0);
return s;
}
bool UnmapCurrentRegion() {
bool result = true;
if (base_ != NULL) {
if (last_sync_ < limit_) {
// Defer syncing this data until next Sync() call, if any
pending_sync_ = true;
}
if (munmap(base_, limit_ - base_) != 0) {
result = false;
}
file_offset_ += limit_ - base_;
base_ = NULL;
limit_ = NULL;
last_sync_ = NULL;
dst_ = NULL;
// Increase the amount we map the next time, but capped at 1MB
if (map_size_ < (1<<20)) {
map_size_ *= 2;
}
}
return result;
}
bool MapNewRegion() {
assert(base_ == NULL);
if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
return false;
}
void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
fd_, file_offset_);
if (ptr == MAP_FAILED) {
return false;
}
base_ = reinterpret_cast<char*>(ptr);
limit_ = base_ + map_size_;
dst_ = base_;
last_sync_ = base_;
return true;
}
public:
PosixMmapFile(const std::string& fname, int fd, size_t page_size)
: filename_(fname),
fd_(fd),
page_size_(page_size),
map_size_(Roundup(65536, page_size)),
base_(NULL),
limit_(NULL),
dst_(NULL),
last_sync_(NULL),
file_offset_(0),
pending_sync_(false) {
assert((page_size & (page_size - 1)) == 0);
}
~PosixMmapFile() {
if (fd_ >= 0) {
PosixMmapFile::Close();
}
}
virtual Status Append(const Slice& data) {
const char* src = data.data();
size_t left = data.size();
while (left > 0) {
assert(base_ <= dst_);
assert(dst_ <= limit_);
size_t avail = limit_ - dst_;
if (avail == 0) {
if (!UnmapCurrentRegion() ||
!MapNewRegion()) {
return IOError(filename_, errno);
}
}
size_t n = (left <= avail) ? left : avail;
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
}
return Status::OK();
}
virtual Status Close() {
Status s;
size_t unused = limit_ - dst_;
if (!UnmapCurrentRegion()) {
s = IOError(filename_, errno);
} else if (unused > 0) {
// Trim the extra space at the end of the file
if (ftruncate(fd_, file_offset_ - unused) < 0) {
s = IOError(filename_, errno);
}
}
if (close(fd_) < 0) {
if (s.ok()) {
s = IOError(filename_, errno);
}
}
fd_ = -1;
base_ = NULL;
limit_ = NULL;
return s;
}
virtual Status Flush() {
return Status::OK();
}
virtual Status Sync() {
Status s;
if (pending_sync_) {
// Some unmapped data was not synced
pending_sync_ = false;
if (fdatasync(fd_) < 0) {
s = IOError(filename_, errno);
}
}
if (dst_ > last_sync_) {
// Find the beginnings of the pages that contain the first and last
// bytes to be synced.
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
last_sync_ = dst_;
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
s = IOError(filename_, errno);
}
}
return s;
}
};
static int LockOrUnlock(int fd, bool lock) {
errno = 0;
struct flock f;
memset(&f, 0, sizeof(f));
f.l_type = (lock ? F_WRLCK : F_UNLCK);
f.l_whence = SEEK_SET;
f.l_start = 0;
f.l_len = 0; // Lock/unlock entire file
return fcntl(fd, F_SETLK, &f);
}
class PosixFileLock : public FileLock {
public:
int fd_;
+ std::string name_;
+};
+
+// Set of locked files. We keep a separate set instead of just
+// relying on fcntrl(F_SETLK) since fcntl(F_SETLK) does not provide
+// any protection against multiple uses from the same process.
+class PosixLockTable {
+ private:
+ port::Mutex mu_;
+ std::set<std::string> locked_files_;
+ public:
+ bool Insert(const std::string& fname) {
+ MutexLock l(&mu_);
+ return locked_files_.insert(fname).second;
+ }
+ void Remove(const std::string& fname) {
+ MutexLock l(&mu_);
+ locked_files_.erase(fname);
+ }
};
class PosixEnv : public Env {
public:
PosixEnv();
virtual ~PosixEnv() {
fprintf(stderr, "Destroying Env::Default()\n");
exit(1);
}
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) {
FILE* f = fopen(fname.c_str(), "r");
if (f == NULL) {
*result = NULL;
return IOError(fname, errno);
} else {
*result = new PosixSequentialFile(fname, f);
return Status::OK();
}
}
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
*result = NULL;
Status s;
int fd = open(fname.c_str(), O_RDONLY);
if (fd < 0) {
s = IOError(fname, errno);
- } else if (sizeof(void*) >= 8) {
- // Use mmap when virtual address-space is plentiful.
+ } else if (mmap_limit_.Acquire()) {
uint64_t size;
s = GetFileSize(fname, &size);
if (s.ok()) {
void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (base != MAP_FAILED) {
- *result = new PosixMmapReadableFile(fname, base, size);
+ *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_);
} else {
s = IOError(fname, errno);
}
}
close(fd);
+ if (!s.ok()) {
+ mmap_limit_.Release();
+ }
} else {
*result = new PosixRandomAccessFile(fname, fd);
}
return s;
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
Status s;
const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
if (fd < 0) {
*result = NULL;
s = IOError(fname, errno);
} else {
*result = new PosixMmapFile(fname, fd, page_size_);
}
return s;
}
virtual bool FileExists(const std::string& fname) {
return access(fname.c_str(), F_OK) == 0;
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
result->clear();
DIR* d = opendir(dir.c_str());
if (d == NULL) {
return IOError(dir, errno);
}
struct dirent* entry;
while ((entry = readdir(d)) != NULL) {
result->push_back(entry->d_name);
}
closedir(d);
return Status::OK();
}
virtual Status DeleteFile(const std::string& fname) {
Status result;
if (unlink(fname.c_str()) != 0) {
result = IOError(fname, errno);
}
return result;
};
virtual Status CreateDir(const std::string& name) {
Status result;
if (mkdir(name.c_str(), 0755) != 0) {
result = IOError(name, errno);
}
return result;
};
virtual Status DeleteDir(const std::string& name) {
Status result;
if (rmdir(name.c_str()) != 0) {
result = IOError(name, errno);
}
return result;
};
virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
Status s;
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
*size = 0;
s = IOError(fname, errno);
} else {
*size = sbuf.st_size;
}
return s;
}
virtual Status RenameFile(const std::string& src, const std::string& target) {
Status result;
if (rename(src.c_str(), target.c_str()) != 0) {
result = IOError(src, errno);
}
return result;
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = NULL;
Status result;
int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
if (fd < 0) {
result = IOError(fname, errno);
+ } else if (!locks_.Insert(fname)) {
+ close(fd);
+ result = Status::IOError("lock " + fname, "already held by process");
} else if (LockOrUnlock(fd, true) == -1) {
result = IOError("lock " + fname, errno);
close(fd);
+ locks_.Remove(fname);
} else {
PosixFileLock* my_lock = new PosixFileLock;
my_lock->fd_ = fd;
+ my_lock->name_ = fname;
*lock = my_lock;
}
return result;
}
virtual Status UnlockFile(FileLock* lock) {
PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
Status result;
if (LockOrUnlock(my_lock->fd_, false) == -1) {
result = IOError("unlock", errno);
}
+ locks_.Remove(my_lock->name_);
close(my_lock->fd_);
delete my_lock;
return result;
}
virtual void Schedule(void (*function)(void*), void* arg);
virtual void StartThread(void (*function)(void* arg), void* arg);
virtual Status GetTestDirectory(std::string* result) {
const char* env = getenv("TEST_TMPDIR");
if (env && env[0] != '\0') {
*result = env;
} else {
char buf[100];
snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid()));
*result = buf;
}
// Directory may already exist
CreateDir(*result);
return Status::OK();
}
static uint64_t gettid() {
pthread_t tid = pthread_self();
uint64_t thread_id = 0;
memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
return thread_id;
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
FILE* f = fopen(fname.c_str(), "w");
if (f == NULL) {
*result = NULL;
return IOError(fname, errno);
} else {
*result = new PosixLogger(f, &PosixEnv::gettid);
return Status::OK();
}
}
virtual uint64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
virtual void SleepForMicroseconds(int micros) {
usleep(micros);
}
private:
void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
exit(1);
}
}
// BGThread() is the body of the background thread
void BGThread();
static void* BGThreadWrapper(void* arg) {
reinterpret_cast<PosixEnv*>(arg)->BGThread();
return NULL;
}
size_t page_size_;
pthread_mutex_t mu_;
pthread_cond_t bgsignal_;
pthread_t bgthread_;
bool started_bgthread_;
// Entry per Schedule() call
struct BGItem { void* arg; void (*function)(void*); };
typedef std::deque<BGItem> BGQueue;
BGQueue queue_;
+
+ PosixLockTable locks_;
+ MmapLimiter mmap_limit_;
};
PosixEnv::PosixEnv() : page_size_(getpagesize()),
started_bgthread_(false) {
PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
}
void PosixEnv::Schedule(void (*function)(void*), void* arg) {
PthreadCall("lock", pthread_mutex_lock(&mu_));
// Start background thread if necessary
if (!started_bgthread_) {
started_bgthread_ = true;
PthreadCall(
"create thread",
pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
}
// If the queue is currently empty, the background thread may currently be
// waiting.
if (queue_.empty()) {
PthreadCall("signal", pthread_cond_signal(&bgsignal_));
}
// Add to priority queue
queue_.push_back(BGItem());
queue_.back().function = function;
queue_.back().arg = arg;
PthreadCall("unlock", pthread_mutex_unlock(&mu_));
}
void PosixEnv::BGThread() {
while (true) {
// Wait until there is an item that is ready to run
PthreadCall("lock", pthread_mutex_lock(&mu_));
while (queue_.empty()) {
PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
}
void (*function)(void*) = queue_.front().function;
void* arg = queue_.front().arg;
queue_.pop_front();
PthreadCall("unlock", pthread_mutex_unlock(&mu_));
(*function)(arg);
}
}
namespace {
struct StartThreadState {
void (*user_function)(void*);
void* arg;
};
}
static void* StartThreadWrapper(void* arg) {
StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
state->user_function(state->arg);
delete state;
return NULL;
}
void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
pthread_t t;
StartThreadState* state = new StartThreadState;
state->user_function = function;
state->arg = arg;
PthreadCall("start thread",
pthread_create(&t, NULL, &StartThreadWrapper, state));
}
} // namespace
static pthread_once_t once = PTHREAD_ONCE_INIT;
static Env* default_env;
static void InitDefaultEnv() { default_env = new PosixEnv; }
Env* Env::Default() {
pthread_once(&once, InitDefaultEnv);
return default_env;
}
} // namespace leveldb
diff --git a/src/leveldb/util/mutexlock.h b/src/leveldb/util/mutexlock.h
index c3f3306d3..1ff5a9efa 100644
--- a/src/leveldb/util/mutexlock.h
+++ b/src/leveldb/util/mutexlock.h
@@ -1,39 +1,41 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
#include "port/port.h"
+#include "port/thread_annotations.h"
namespace leveldb {
// Helper class that locks a mutex on construction and unlocks the mutex when
// the destructor of the MutexLock object is invoked.
//
// Typical usage:
//
// void MyClass::MyMethod() {
// MutexLock l(&mu_); // mu_ is an instance variable
// ... some complex code, possibly with multiple return paths ...
// }
-class MutexLock {
+class SCOPED_LOCKABLE MutexLock {
public:
- explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+ explicit MutexLock(port::Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu)
+ : mu_(mu) {
this->mu_->Lock();
}
- ~MutexLock() { this->mu_->Unlock(); }
+ ~MutexLock() UNLOCK_FUNCTION() { this->mu_->Unlock(); }
private:
port::Mutex *const mu_;
// No copying allowed
MutexLock(const MutexLock&);
void operator=(const MutexLock&);
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
diff --git a/src/leveldb/util/win_logger.cc b/src/leveldb/util/win_logger.cc
deleted file mode 100644
index 834c98cc7..000000000
--- a/src/leveldb/util/win_logger.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "util/win_logger.h"
-
-#include <windows.h>
-
-namespace leveldb {
-
-void WinLogger::Logv(const char* format, va_list ap) {
- const uint64_t thread_id = static_cast<uint64_t>(::GetCurrentThreadId());
-
- // We try twice: the first time with a fixed-size stack allocated buffer,
- // and the second time with a much larger dynamically allocated buffer.
- char buffer[500];
-
- for (int iter = 0; iter < 2; iter++) {
- char* base;
- int bufsize;
- if (iter == 0) {
- bufsize = sizeof(buffer);
- base = buffer;
- } else {
- bufsize = 30000;
- base = new char[bufsize];
- }
-
- char* p = base;
- char* limit = base + bufsize;
-
- SYSTEMTIME st;
-
- // GetSystemTime returns UTC time, we want local time!
- ::GetLocalTime(&st);
-
-#ifdef _MSC_VER
- p += _snprintf_s(p, limit - p, _TRUNCATE,
- "%04d/%02d/%02d-%02d:%02d:%02d.%03d %llx ",
- st.wYear,
- st.wMonth,
- st.wDay,
- st.wHour,
- st.wMinute,
- st.wSecond,
- st.wMilliseconds,
- static_cast<long long unsigned int>(thread_id));
-#else
-#ifdef __MINGW32__
- p += snprintf(p, limit - p,
- "%04d/%02d/%02d-%02d:%02d:%02d.%03d %llx ",
- st.wYear,
- st.wMonth,
- st.wDay,
- st.wHour,
- st.wMinute,
- st.wSecond,
- st.wMilliseconds,
- static_cast<long long unsigned int>(thread_id));
-#else
-#error Unable to detect Windows compiler (neither _MSC_VER nor __MINGW32__ are set)
-#endif
-#endif
-
- // Print the message
- if (p < limit) {
- va_list backup_ap = ap;
- p += vsnprintf(p, limit - p, format, backup_ap);
- va_end(backup_ap);
- }
-
- // Truncate to available space if necessary
- if (p >= limit) {
- if (iter == 0) {
- continue; // Try again with larger buffer
- } else {
- p = limit - 1;
- }
- }
-
- // Add newline if necessary
- if (p == base || p[-1] != '\n') {
- *p++ = '\n';
- }
-
- assert(p <= limit);
- fwrite(base, 1, p - base, file_);
- fflush(file_);
- if (base != buffer) {
- delete[] base;
- }
- break;
- }
-}
-
-}
\ No newline at end of file
diff --git a/src/leveldb/util/win_logger.h b/src/leveldb/util/win_logger.h
deleted file mode 100644
index b155d5c31..000000000
--- a/src/leveldb/util/win_logger.h
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// Logger implementation for Windows
-
-#ifndef STORAGE_LEVELDB_UTIL_WIN_LOGGER_H_
-#define STORAGE_LEVELDB_UTIL_WIN_LOGGER_H_
-
-#include <stdio.h>
-#include "leveldb/env.h"
-
-namespace leveldb {
-
-class WinLogger : public Logger {
- private:
- FILE* file_;
- public:
- explicit WinLogger(FILE* f) : file_(f) { assert(file_); }
- virtual ~WinLogger() {
- fclose(file_);
- }
- virtual void Logv(const char* format, va_list ap);
-
-};
-
-}
-#endif // STORAGE_LEVELDB_UTIL_WIN_LOGGER_H_

File Metadata

Mime Type
text/x-diff
Expires
Sun, Mar 2, 11:30 (1 d, 5 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5187620
Default Alt Text
(417 KB)

Event Timeline