练习43：一个简单的统计引擎

计算标准差和均值

`sum`

`sumsq`（平方和）

`count(n)`

`min`

`max`

`mean`

`stddev`

``````> s <- runif(n=10, max=10)
> s
[1] 6.1061334 9.6783204 1.2747090 8.2395131 0.3333483 6.9755066 1.0626275
[8] 7.6587523 4.9382973 9.5788115
> summary(s)
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.3333  2.1910  6.5410  5.5850  8.0940  9.6780
> sd(s)
[1] 3.547868
> sum(s)
[1] 55.84602
> sum(s * s)
[1] 425.1641
> sum(s) * mean(s)
[1] 311.8778
> sum(s * s) - sum(s) * mean(s)
[1] 113.2863
> (sum(s * s) - sum(s) * mean(s)) / (length(s) - 1)
[1] 12.58737
> sqrt((sum(s * s) - sum(s) * mean(s)) / (length(s) - 1))
[1] 3.547868
>
``````

lines 1-4

lines 5-7

lines 8-9

lines 10-11

lines 12-13

`stddev`公式中的下一部分是`sumsq`，我可以通过`sum(s * s)`来得到，它告诉R将整个`s`列表乘以其自身，之后计算它们的`sum`。R可以像这样在整个数据结构上做运算。

lines 14-15

lines 16-17

lines 18-19

lines 20-21

实现

``````#ifndef lcthw_stats_h
/* Include guard: this name must match the one tested by the #ifndef. */
#define lcthw_stats_h

/**
 * Running summary of a series of samples.
 *
 * Only aggregates are stored, never the samples themselves; the mean and
 * the standard deviation are derived from sum, sumsq and n.
 */
struct Stats {
    double sum;       /* sum of every sample */
    double sumsq;     /* sum of the square of every sample */
    unsigned long n;  /* number of samples */
    double min;       /* smallest sample */
    double max;       /* largest sample */
};

typedef struct Stats Stats;

/**
 * Allocate a Stats initialised from previously computed aggregates.
 *
 * @param sum    sum of the samples.
 * @param sumsq  sum of the squared samples.
 * @param n      number of samples.
 * @param min    smallest sample.
 * @param max    largest sample.
 * @return a heap-allocated Stats (allocated with malloc, owned by the
 *         caller), or NULL if the allocation fails.
 */
Stats *Stats_recreate(double sum, double sumsq, unsigned long n, double min, double max);

/**
 * Allocate an empty Stats with every aggregate set to zero.
 *
 * Declared with (void) so that the compiler rejects calls that pass
 * arguments; an empty parameter list would leave them unchecked.
 *
 * @return a heap-allocated Stats owned by the caller, or NULL on failure.
 */
Stats *Stats_create(void);

/**
 * @param st  statistics to read; must not be NULL.
 * @return the arithmetic mean, sum / n.
 */
double Stats_mean(Stats *st);

/**
 * @param st  statistics to read; must not be NULL.
 * @return the sample standard deviation (n - 1 denominator) computed from
 *         sum, sumsq and n. Not meaningful for fewer than two samples.
 */
double Stats_stddev(Stats *st);

/**
 * Fold one sample into the running aggregates.
 *
 * @param st  statistics to update; must not be NULL.
 * @param s   the new sample.
 */
void Stats_sample(Stats *st, double s);

/**
 * Print every aggregate, plus the mean and standard deviation, to stderr.
 *
 * @param st  statistics to print; must not be NULL.
 */
void Stats_dump(Stats *st);

#endif
``````

``````#include <math.h>
#include <lcthw/stats.h>
#include <stdlib.h>
#include <lcthw/dbg.h>

Stats *Stats_recreate(double sum, double sumsq, unsigned long n, double min, double max)
{
Stats *st = malloc(sizeof(Stats));
check_mem(st);

st->sum = sum;
st->sumsq = sumsq;
st->n = n;
st->min = min;
st->max = max;

return st;

error:
return NULL;
}

/**
 * Allocate an empty Stats: all aggregates zero, no samples recorded.
 *
 * Defined with (void) so the definition also acts as a prototype; the
 * previous empty parameter list let callers pass arguments unchecked.
 *
 * @return whatever Stats_recreate returns: the new Stats, or NULL on failure.
 */
Stats *Stats_create(void)
{
    return Stats_recreate(0.0, 0.0, 0UL, 0.0, 0.0);
}

/**
 * Arithmetic mean of the samples summarised by st.
 *
 * @param st  statistics to read; must not be NULL.
 * @return st->sum divided by st->n. No check is made for n == 0, so an
 *         empty Stats divides by zero.
 */
double Stats_mean(Stats *st)
{
    const double total = st->sum;
    const double count = st->n;

    return total / count;
}

/**
 * Sample standard deviation, computed from the running aggregates as
 * sqrt((sumsq - sum * sum / n) / (n - 1)).
 *
 * @param st  statistics to read; must not be NULL.
 * @return the standard deviation, or NAN when fewer than two samples have
 *         been recorded (the n - 1 denominator is then undefined).
 */
double Stats_stddev(Stats *st)
{
    /*
     * n is unsigned: with n == 0 the expression n - 1 wraps around to
     * ULONG_MAX, and with n == 1 the denominator is zero. Report "undefined"
     * explicitly instead of relying on how that arithmetic happens to fall out.
     */
    if (st->n < 2) {
        return NAN;
    }

    double variance = (st->sumsq - (st->sum * st->sum / st->n)) / (st->n - 1);

    /*
     * The subtraction cancels two nearly equal numbers when the samples are
     * close together, so rounding error can leave a tiny negative value.
     * sqrt() of that would be NaN, so clamp it to zero.
     */
    if (variance < 0.0) {
        variance = 0.0;
    }

    return sqrt(variance);
}

/**
 * Fold one sample into the running aggregates.
 *
 * The first sample (n == 0) becomes both min and max; later samples only
 * replace min or max when they are strictly smaller or larger.
 *
 * @param st  statistics to update; must not be NULL.
 * @param s   the new sample.
 */
void Stats_sample(Stats *st, double s)
{
    const int is_first = (st->n == 0);

    if (is_first || s < st->min) {
        st->min = s;
    }
    if (is_first || s > st->max) {
        st->max = s;
    }

    st->sum += s;
    st->sumsq += s * s;
    st->n++;
}

/**
 * Print every aggregate, plus the derived mean and standard deviation,
 * to stderr on a single line (no trailing newline is written).
 *
 * @param st  statistics to print; must not be NULL.
 */
void Stats_dump(Stats *st)
{
    /* n is an unsigned long, so it needs %lu; %ld does not match its type. */
    fprintf(stderr, "sum: %f, sumsq: %f, n: %lu, min: %f, max: %f, mean: %f, stddev: %f",
            st->sum, st->sumsq, st->n, st->min, st->max,
            Stats_mean(st), Stats_stddev(st));
}
``````

Stats_recreate

Stats_create

Stats_mean

Stats_stddev

Stats_sample

Stats_dump

``````#include "minunit.h"
#include <lcthw/stats.h>
#include <math.h>

/* Sample data: the ten values produced by runif() in the R session. */
double samples[] = {
    6.1061334, 9.6783204, 1.2747090, 8.2395131, 0.3333483,
    6.9755066, 1.0626275, 7.6587523, 4.9382973, 9.5788115
};

/* Derived from the array so the count cannot drift out of sync with the data. */
const int NUM_SAMPLES = (int)(sizeof(samples) / sizeof(samples[0]));

/*
 * Reference values taken from the R session for the samples above.
 * min and max are written to three decimals only; the tests compare them
 * with EQ(..., 3).
 */
Stats expect = {
    .sum = 55.84602,
    .sumsq = 425.1641,
    .n = 10,
    .min = 0.333,
    .max = 9.678,
};

/* Expected derived statistics, as reported by mean(s) and sd(s) in R. */
double expect_mean = 5.584602;
double expect_stddev = 3.547868;

/*
 * EQ(X, Y, N): true when X and Y differ by less than 10^-N.
 *
 * An absolute tolerance is used instead of comparing the two values after
 * rounding to N decimals: rounding reports "different" for values that are
 * arbitrarily close but sit on opposite sides of a rounding boundary
 * (e.g. 0.0004999 and 0.0005001 with N = 3). Any pair that rounds to the
 * same N-decimal value is still within this tolerance.
 */
#define EQ(X,Y,N) (fabs((double)(X) - (double)(Y)) < pow(10, -(N)))

/*
 * Feed every entry of samples[] through Stats_sample and check each
 * aggregate, the mean and the standard deviation against the reference
 * values to three decimals.
 *
 * Returns NULL when every check passes.
 */
char *test_operations()
{
    Stats *st = Stats_create();
    mu_assert(st != NULL, "Failed to create stats.");

    for(int i = 0; i < NUM_SAMPLES; i++) {
        Stats_sample(st, samples[i]);
    }

    Stats_dump(st);

    mu_assert(EQ(st->sumsq, expect.sumsq, 3), "sumsq not valid");
    mu_assert(EQ(st->sum, expect.sum, 3), "sum not valid");
    mu_assert(EQ(st->min, expect.min, 3), "min not valid");
    mu_assert(EQ(st->max, expect.max, 3), "max not valid");
    /* This check is on n; its message previously said "max" (copy-paste slip). */
    mu_assert(EQ(st->n, expect.n, 3), "n not valid");
    mu_assert(EQ(expect_mean, Stats_mean(st), 3), "mean not valid");
    mu_assert(EQ(expect_stddev, Stats_stddev(st), 3), "stddev not valid");

    return NULL;
}

/*
 * Rebuild a Stats from the reference aggregates with Stats_recreate and
 * check that every field is stored unchanged and that the derived mean and
 * standard deviation match the reference values to three decimals.
 *
 * Returns NULL when every check passes.
 */
char *test_recreate()
{
    Stats *st = Stats_recreate(expect.sum, expect.sumsq, expect.n, expect.min, expect.max);
    /* Stats_recreate returns NULL on failure; fail the test instead of dereferencing it. */
    mu_assert(st != NULL, "Failed to recreate stats.");

    mu_assert(st->sum == expect.sum, "sum not equal");
    mu_assert(st->sumsq == expect.sumsq, "sumsq not equal");
    mu_assert(st->n == expect.n, "n not equal");
    mu_assert(st->min == expect.min, "min not equal");
    mu_assert(st->max == expect.max, "max not equal");
    mu_assert(EQ(expect_mean, Stats_mean(st), 3), "mean not valid");
    mu_assert(EQ(expect_stddev, Stats_stddev(st), 3), "stddev not valid");

    return NULL;
}

/*
 * Test suite entry point: runs test_operations and then test_recreate
 * through mu_run_test, and returns NULL once both have been run.
 * NOTE(review): mu_suite_start and mu_run_test come from minunit.h, which is
 * not shown here; presumably mu_run_test returns early on a failing test --
 * confirm against minunit.h.
 */
char *all_tests()
{
mu_suite_start();

mu_run_test(test_operations);
mu_run_test(test_recreate);

return NULL;
}

/* Hand the suite to minunit's RUN_TESTS macro (presumably this expands to main() -- confirm in minunit.h). */
RUN_TESTS(all_tests);
``````

如何使用

• 用户John登录并登出服务器A。获取服务器A的统计量，并更新它们。
• 获取`mean of means`统计量，计算出A的均值并且将其加入样本。我叫它`m_of_m`
• 获取`mean of stddev`统计量，将A的标准差添加到样本中。我叫它`m_of_s`
• 如果A的`mean`在`m_of_m.mean + 2 * m_of_m.stddev`范围外，标记它可能存在问题。
• 如果A的`stddev`在`m_of_s.mean + 2 * m_of_s.stddev`范围外，标记它可能存在行为异常。
• 最后，如果John的登录时长在A的范围之外，或A的`m_of_m`范围之外，标记为有趣的。

附加题

• 将`Stats_stddev`和`Stats_mean`转换为`static inline`函数，放到`stats.h`文件中，而不是`stats.c`文件。
• 使用这份代码来编写`string_algos_test.c`的性能测试。使它为可选的，并且运行基准测试作为一系列样本，之后报告结果。
• 编写它的另一个语言的版本。确保这个版本基于我的数据正确执行。
• 编写一个小型程序，它能从文件读取所有数字，并执行这些统计。
• 使程序接收一个数据表，其中第一行是表头，剩下的行含有任意数量空格分隔的数值。你的程序应该按照表头中的名称，打印出每一列的统计值。