// blob: 203bf24b5ae959e3aa06aeef8117f85ecbaf57d2 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "session/session_watch_dog.h"
#include <algorithm>
#include <cstring>
#include <numeric>
#include <string>
#include "base/cpu_stats.h"
#include "base/logging.h"
#include "base/port.h"
#include "base/scoped_ptr.h"
#include "base/system_util.h"
#include "base/unnamed_event.h"
#include "base/util.h"
#include "client/client_interface.h"
namespace mozc {
namespace {
// IPC timeouts, in milliseconds.  Each value is handed to
// ClientInterface::set_timeout() right before the corresponding command
// is issued from SessionWatchDog::Run().
const int32 kCleanupTimeout = 30 * 1000; // 30 sec for Cleanup Command
const int32 kPingTimeout = 5 * 1000; // 5 sec for Ping
// Maximum number of Ping attempts made after a Cleanup command failed.
const int32 kPingTrial = 3;
// Time (msec) waited on the stop event before each Ping attempt.
const int32 kPingInterval = 1000;
// Threshold for the CPU load averaged over every sample collected during
// one check period (at most the last 1 min).
// If the load > kMinimumAllCPULoad, don't send Cleanup.
// NOTE: despite the "Minimum" in the name, this acts as an upper bound.
const float kMinimumAllCPULoad = 0.33f;
// Threshold for the CPU load averaged over a short two-sample window
// (about 10 secs when samples are taken every 5 secs).
// If the load > kMinimumLatestCPULoad, don't send Cleanup.
// NOTE: despite the "Minimum" in the name, this acts as an upper bound.
const float kMinimumLatestCPULoad = 0.66f;
} // namespace
// Builds a watch dog that wakes up every |interval_sec| seconds.
// The requested interval is clamped into the range [1..600].
// No client or CPU-stats interface is attached at construction time; Run()
// supplies defaults when none has been injected via the setters.
SessionWatchDog::SessionWatchDog(int32 interval_sec)
    : interval_sec_(max(1, min(interval_sec, 600))),
      client_(NULL),
      cpu_stats_(NULL),
      event_(new UnnamedEvent) {
  DCHECK(event_->IsAvailable()) << "Unnamed event is not available";
}
// Stops the watch dog thread (if it is still running) before the object
// goes away.
SessionWatchDog::~SessionWatchDog() {
  this->Terminate();
}
// Injects the client used to send Cleanup/Ping commands.
// Only the pointer is stored; this function does not delete |client|.
void SessionWatchDog::SetClientInterface(client::ClientInterface *client) {
  this->client_ = client;
}
// Injects the object used to query CPU load.
// Only the pointer is stored; this function does not delete |cpu_stats|.
void SessionWatchDog::SetCPUStatsInterface(CPUStatsInterface *cpu_stats) {
  this->cpu_stats_ = cpu_stats;
}
// Asks the watch dog thread to stop and waits until it has finished.
// Does nothing when the thread is not running.  If the stop event cannot
// be signaled, the thread is terminated forcibly before joining.
void SessionWatchDog::Terminate() {
  if (IsRunning()) {
    const bool notified = event_->Notify();
    if (!notified) {
      LOG(ERROR) << "UnnamedEvent::Notify() failed";
      Thread::Terminate();
    }
    Join();
  }
}
void SessionWatchDog::Run() {
scoped_ptr<client::ClientInterface> client_impl;
if (client_ == NULL) {
VLOG(2) << "default client is used";
client_impl.reset(client::ClientFactory::NewClient());
client_ = client_impl.get();
}
scoped_ptr<CPUStatsInterface> cpu_stats_impl;
if (cpu_stats_ == NULL) {
VLOG(2) << "default cpu_stats is used";
cpu_stats_impl.reset(new CPUStats);
cpu_stats_ = cpu_stats_impl.get();
}
if (!event_->IsAvailable()) {
LOG(ERROR) << "Unnamed event is not available";
return;
}
// CPU load check
// add volatile to store this array in stack
volatile float cpu_loads[16]; // 60/5 = 12 is the minimal size
volatile float total_cpu_load = 0.0;
volatile float current_process_cpu_load = 0.0;
const volatile size_t number_of_processors =
cpu_stats_->GetNumberOfProcessors();
DCHECK_GE(number_of_processors, 1);
// the first (interval_sec_ - 60) sec: -> Do nothing
const int32 idle_interval_msec = max(0, (interval_sec_ - 60)) * 1000;
// last 60 sec: -> check CPU usage
const int32 cpu_check_interval_msec = min(60, interval_sec_) * 1000;
// for every 5 second, get CPU load percentage
const int32 cpu_check_duration_msec = min(5, interval_sec_) * 1000;
fill(cpu_loads, cpu_loads + arraysize(cpu_loads), 0.0);
uint64 last_cleanup_time = Util::GetTime();
while (true) {
VLOG(1) << "Start sleeping " << idle_interval_msec;
if (event_->Wait(idle_interval_msec)) {
VLOG(1) << "Received stop signal";
return;
}
VLOG(1) << "Finish sleeping " << idle_interval_msec;
int32 cpu_loads_index = 0;
for (int n = 0; n < cpu_check_interval_msec;
n += cpu_check_duration_msec) {
if (event_->Wait(cpu_check_duration_msec)) {
VLOG(1) << "Received stop signal";
return;
}
// save them in stack for debugging
total_cpu_load = cpu_stats_->GetSystemCPULoad();
current_process_cpu_load = cpu_stats_->GetCurrentProcessCPULoad();
VLOG(1) << "total=" << total_cpu_load
<< " current=" << current_process_cpu_load
<< " normalized_current="
<< current_process_cpu_load / number_of_processors;
// subtract the CPU load of my process from total CPU load.
// This is required for running stress test.
const float extracted_cpu_load =
total_cpu_load - current_process_cpu_load / number_of_processors;
cpu_loads[cpu_loads_index++] = max(0.0f, extracted_cpu_load);
}
DCHECK_GT(cpu_loads_index, 0);
const uint64 current_cleanup_time = Util::GetTime();
if (!CanSendCleanupCommand(cpu_loads,
cpu_loads_index,
current_cleanup_time,
last_cleanup_time)) {
VLOG(1) << "CanSendCleanupCommand returned false";
last_cleanup_time = current_cleanup_time;
continue;
}
last_cleanup_time = current_cleanup_time;
VLOG(2) << "Sending Cleanup command";
client_->set_timeout(kCleanupTimeout);
if (client_->Cleanup()) {
VLOG(2) << "Cleanup command succeeded";
continue;
}
LOG(WARNING) << "Cleanup failed "
<< "execute PingCommand to check server is running";
bool failed = true;
client_->Reset();
client_->set_timeout(kPingTimeout);
for (int i = 0; i < kPingTrial; ++i) {
if (event_->Wait(kPingInterval)) {
VLOG(1) << "Received stop signal";
return;
}
if (client_->PingServer()) {
VLOG(2) << "Ping command succeeded";
failed = false;
break;
}
LOG(ERROR) << "Ping command failed, waiting "
<< kPingInterval << " msec, trial: "
<< i;
}
if (failed) {
if (event_->Wait(100)) {
VLOG(1) << "Parent thread is already terminated";
return;
}
#ifndef NO_LOGGING
// We have received crash dumps caused by the following LOG(FATAL).
// Unfortunately, we cannot investigate the cause of this error,
// as the crash dump doesn't contain any logging information.
// Here we temporary save the user name into stack in order
// to obtain the log file before the LOG(FATAL).
char user_name[32];
const string tmp = SystemUtil::GetUserNameAsString();
strncpy(user_name, tmp.c_str(), sizeof(user_name));
VLOG(1) << "user_name: " << user_name;
#endif
LOG(FATAL) << "Cleanup commands failed. Rasing exception...";
}
}
}
bool SessionWatchDog::CanSendCleanupCommand(
const volatile float *cpu_loads,
int cpu_loads_index,
uint64 current_cleanup_time,
uint64 last_cleanup_time) const {
if (current_cleanup_time <= last_cleanup_time) {
LOG(ERROR) << "time stamps are the same. clock may be altered";
return false;
}
const float all_avg =
std::accumulate(cpu_loads, cpu_loads + cpu_loads_index, 0.0)
/ cpu_loads_index;
const size_t latest_size = min(2, cpu_loads_index);
const float latest_avg =
std::accumulate(cpu_loads, cpu_loads + latest_size, 0.0)
/ latest_size;
VLOG(1) << "Average CPU load=" << all_avg
<< " latest CPU load=" << latest_avg;
if (all_avg > kMinimumAllCPULoad ||
latest_avg > kMinimumLatestCPULoad) {
VLOG(1) << "Don't send Cleanup command, since CPU load is too high: "
<< all_avg << " " << latest_avg;
return false;
}
// if the real interval from the last cleanup command
// is 2 * interval(), assume that the computer went to
// suspend mode
if ((current_cleanup_time - last_cleanup_time) > 2 * interval()) {
VLOG(1) << "Don't send cleanup because "
<< "Server went to suspend mode.";
return false;
}
VLOG(2) << "CanSendCleanupCommand passed";
return true;
}
} // namespace mozc