diff --git a/bench/latency/Makefile b/bench/latency/Makefile
new file mode 100644
index 00000000..a8950808
--- /dev/null
+++ b/bench/latency/Makefile
@@ -0,0 +1,28 @@
+CXX	?= g++
+CXXFLAGS	= -march=native -Wall -std=c++11 -pthread 
+CXX_RELEASE_FLAGS = -Ofast -DNDEBUG
+
+
+binaries=spdlog-latency g3log-latency
+
+all: $(binaries)
+
+spdlog-latency: spdlog-latency.cpp
+	$(CXX) spdlog-latency.cpp -o spdlog-latency $(CXXFLAGS) $(CXX_RELEASE_FLAGS) -I../../include
+
+
+
+g3log-latency: g3log-latency.cpp
+	$(CXX) g3log-latency.cpp -o g3log-latency $(CXXFLAGS) $(CXX_RELEASE_FLAGS) -I../../../g3log/src -L. -lg3logger 
+		
+	
+.PHONY: clean
+
+clean:
+	rm -f *.o  *.log $(binaries)
+
+
+rebuild: clean all
+
+
+
diff --git a/bench/latency/compare.sh b/bench/latency/compare.sh
new file mode 100755
index 00000000..0f0e4c97
--- /dev/null
+++ b/bench/latency/compare.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+echo "running spdlog and g3log tests 10 time with ${1:-10} threads each (total 1,000,000 entries).."
+rm -f *.log
+for i in {1..10}
+
+do
+   echo
+   sleep 0.5
+   ./spdlog-latency ${1:-10} 2>/dev/null || exit
+   sleep 0.5
+   ./g3log-latency ${1:-10} 2>/dev/null || exit
+
+done
diff --git a/bench/latency/g3log-latency.cpp b/bench/latency/g3log-latency.cpp
new file mode 100644
index 00000000..b3c92b49
--- /dev/null
+++ b/bench/latency/g3log-latency.cpp
@@ -0,0 +1,143 @@
+//// to compile:   c++ bench.cpp -o bench -Wall -Wshadow -Wextra -pedantic -std=c++11 -pthread -I../include -O3 -fPIC -Ofast -m64 -march=native
+
+// Alternative:   c++ bench.cpp -o bench -Wall -Wshadow -Wextra -pedantic -std=c++11 -pthread -I../include -O3  -march=native
+
+
+// the test code itself is Public domain @ref: Unlicense.org
+// made by KjellKod, 2015, first published for testing of g3log at github.com/kjellkod/g3log
+// Feel free to share, modify etc with no obligations but also with no guarantees from my part either
+// enjoy - Kjell Hedstrom (aka KjellKod)
+//
+//
+// spdlog follows however another license. See the bottow of this file
+//
+
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <iostream>
+#include <chrono>
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <cstdio>
+#include <map>
+#include <numeric>
+#include <functional>
+#include <thread>
+#include "utils.h"
+#include <g3log/g3log.hpp>
+#include <g3log/logworker.hpp>
+
+
+namespace
+{
+const uint64_t g_iterations = 1000000;
+
+
+std::atomic<size_t> g_counter = {0};
+
+
+void MeasurePeakDuringLogWrites(const size_t id, std::vector<uint64_t>& result)
+{
+
+    while (true)
+    {
+        const size_t value_now = ++g_counter;
+        if (value_now > g_iterations)
+        {
+            return;
+        }
+
+        auto start_time = std::chrono::high_resolution_clock::now();
+		LOGF(INFO, "Some text to log for thread: %ld", id);        
+        auto stop_time = std::chrono::high_resolution_clock::now();
+        uint64_t time_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_time - start_time).count();
+        result.push_back(time_us);
+    }
+}
+
+
+
+void PrintResults(const std::map<size_t, std::vector<uint64_t>>& threads_result, size_t total_us)
+{
+
+    std::vector<uint64_t> all_measurements;
+    all_measurements.reserve(g_iterations);
+    for (auto& t_result : threads_result)
+    {
+        all_measurements.insert(all_measurements.end(), t_result.second.begin(), t_result.second.end());
+    }
+
+    // calc worst latenct
+    auto worst = *std::max_element(all_measurements.begin(), all_measurements.end());
+    
+    // calc avg
+    auto total = accumulate(begin(all_measurements), end(all_measurements), 0, std::plus<uint64_t>());
+    auto avg = double(total)/all_measurements.size();
+
+    std::cout << "[g3log]  worst: " <<  std::setw(10) << std::right << worst << "\tAvg: "  << avg << "\tTotal: "  <<  utils::format(total_us) << " us" << std::endl;
+  
+}
+}// anonymous
+
+
+// The purpose of this test is NOT to see how fast
+// each thread can possibly write. It is to see what
+// the worst latency is for writing a log entry
+//
+// In the test 1 million log entries will be written
+// an atomic counter is used to give each thread what
+// it is to write next. The overhead of atomic
+// synchronization between the threads are not counted in the worst case latency
+int main(int argc, char** argv)
+{
+    size_t number_of_threads {0};
+    if (argc == 2)
+    {
+        number_of_threads = atoi(argv[1]);
+    }
+    if (argc != 2 || number_of_threads == 0)
+    {
+        std::cerr << "USAGE is: " << argv[0] << " number_threads" << std::endl;
+        return 1;
+    }
+
+
+    std::vector<std::thread> threads(number_of_threads);
+    std::map<size_t, std::vector<uint64_t>> threads_result;
+
+    for (size_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        // reserve to 1 million for all the result
+        // it's a test so  let's not care about the wasted space
+        threads_result[idx].reserve(g_iterations);
+    }
+
+    const std::string g_path = "./" ;
+    const std::string  g_prefix_log_name = "g3log-performance-";
+    const std::string  g_measurement_dump = g_path + g_prefix_log_name + "_RESULT.txt";
+
+	auto worker = g3::LogWorker::createLogWorker();
+    auto handle= worker->addDefaultLogger(argv[0], "g3log.txt");
+    g3::initializeLogging(worker.get());
+    
+    auto start_time_application_total = std::chrono::high_resolution_clock::now();
+    for (uint64_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        threads[idx] = std::thread(MeasurePeakDuringLogWrites, idx, std::ref(threads_result[idx]));
+    }
+    for (size_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        threads[idx].join();
+    }
+    auto stop_time_application_total = std::chrono::high_resolution_clock::now();
+
+    uint64_t total_time_in_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_time_application_total - start_time_application_total).count();
+    PrintResults(threads_result, total_time_in_us);
+    return 0;
+}
+
+
diff --git a/bench/latency/spdlog-latency.cpp b/bench/latency/spdlog-latency.cpp
new file mode 100644
index 00000000..5480264c
--- /dev/null
+++ b/bench/latency/spdlog-latency.cpp
@@ -0,0 +1,141 @@
+//// to compile:   c++ bench.cpp -o bench -Wall -Wshadow -Wextra -pedantic -std=c++11 -pthread -I../include -O3 -fPIC -Ofast -m64 -march=native
+
+// Alternative:   c++ bench.cpp -o bench -Wall -Wshadow -Wextra -pedantic -std=c++11 -pthread -I../include -O3  -march=native
+
+
+// the test code itself is Public domain @ref: Unlicense.org
+// made by KjellKod, 2015, first published for testing of g3log at github.com/kjellkod/g3log
+// Feel free to share, modify etc with no obligations but also with no guarantees from my part either
+// enjoy - Kjell Hedstrom (aka KjellKod)
+//
+//
+// spdlog follows however another license. See the bottow of this file
+//
+
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <iostream>
+#include <chrono>
+#include <algorithm>
+#include <iostream>
+#include <cstdio>
+#include <map>
+#include <numeric>
+#include <functional>
+#include "utils.h"
+#include <thread>
+
+#include "spdlog/spdlog.h"
+
+namespace spd = spdlog;
+
+namespace
+{
+const uint64_t g_iterations = 1000000;
+
+
+std::atomic<size_t> g_counter = {0};
+
+
+void MeasurePeakDuringLogWrites(const size_t id, std::vector<uint64_t>& result)
+{
+	auto logger = spd::get("file_logger");
+    while (true)
+    {
+        const size_t value_now = ++g_counter;
+        if (value_now > g_iterations)
+        {
+            return;
+        }
+
+        auto start_time = std::chrono::high_resolution_clock::now();
+        logger->info("Some text to log for thread: [somemore text...............................] {}", id);
+        auto stop_time = std::chrono::high_resolution_clock::now();
+        uint64_t time_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_time - start_time).count();
+        result.push_back(time_us);
+    }
+}
+
+
+void PrintResults(const std::map<size_t, std::vector<uint64_t>>& threads_result, size_t total_us)
+{
+
+    std::vector<uint64_t> all_measurements;
+    all_measurements.reserve(g_iterations);
+    for (auto& t_result : threads_result)
+    {
+        all_measurements.insert(all_measurements.end(), t_result.second.begin(), t_result.second.end());
+    }
+
+    // calc worst latenct
+    auto worst = *std::max_element(all_measurements.begin(), all_measurements.end());
+    
+    // calc avg
+    auto total = accumulate(begin(all_measurements), end(all_measurements), 0, std::plus<uint64_t>());
+    auto avg = double(total)/all_measurements.size();
+	
+    std::cout << "[spdlog] worst: " <<  std::setw(10) << std::right << worst << "\tAvg: "  << avg << "\tTotal: "  <<  utils::format(total_us) << " us" << std::endl;	
+  
+}
+}// anonymous
+
+
+// The purpose of this test is NOT to see how fast
+// each thread can possibly write. It is to see what
+// the worst latency is for writing a log entry
+//
+// In the test 1 million log entries will be written
+// an atomic counter is used to give each thread what
+// it is to write next. The overhead of atomic
+// synchronization between the threads are not counted in the worst case latency
+int main(int argc, char** argv)
+{
+    size_t number_of_threads {0};
+    if (argc == 2)
+    {
+        number_of_threads = atoi(argv[1]);
+    }
+    if (argc != 2 || number_of_threads == 0)
+    {
+        std::cerr << "usage: " << argv[0] << " number_threads" << std::endl;
+        return 1;
+    }
+
+
+    std::vector<std::thread> threads(number_of_threads);
+    std::map<size_t, std::vector<uint64_t>> threads_result;
+
+    for (size_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        // reserve to 1 million for all the result
+        // it's a test so  let's not care about the wasted space
+        threads_result[idx].reserve(g_iterations);
+    }
+
+    int queue_size = 1048576; // 2 ^ 20
+    spdlog::set_async_mode(queue_size); 	
+    auto logger = spdlog::create<spd::sinks::simple_file_sink_mt>("file_logger", "spdlog.log", true);	
+	
+	//force flush on every call to compare with g3log
+	auto s = (spd::sinks::simple_file_sink_mt*)logger->sinks()[0].get();
+	s->set_force_flush(true);
+	
+    auto start_time_application_total = std::chrono::high_resolution_clock::now();
+    for (uint64_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        threads[idx] = std::thread(MeasurePeakDuringLogWrites, idx, std::ref(threads_result[idx]));
+    }
+    for (size_t idx = 0; idx < number_of_threads; ++idx)
+    {
+        threads[idx].join();
+    }
+    auto stop_time_application_total = std::chrono::high_resolution_clock::now();
+
+    uint64_t total_time_in_us = std::chrono::duration_cast<std::chrono::microseconds>(stop_time_application_total - start_time_application_total).count();
+
+    PrintResults(threads_result, total_time_in_us);
+    return 0;
+}
+
+
diff --git a/bench/latency/utils.h b/bench/latency/utils.h
new file mode 100644
index 00000000..b260f724
--- /dev/null
+++ b/bench/latency/utils.h
@@ -0,0 +1,35 @@
+//
+// Copyright(c) 2015 Gabi Melman.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+//
+
+#pragma once
+
+#include <sstream>
+#include <iomanip>
+#include <locale>
+
+namespace utils
+{
+
+template<typename T>
+inline std::string format(const T& value)
+{
+    static std::locale loc("");
+    std::stringstream ss;
+    ss.imbue(loc);
+    ss << value;
+    return ss.str();
+}
+
+template<>
+inline std::string format(const double & value)
+{
+    static std::locale loc("");
+    std::stringstream ss;
+    ss.imbue(loc);
+    ss << std::fixed << std::setprecision(1) << value;
+    return ss.str();
+}
+
+}