postgresql - 如何在不先连接所有内容的情况下增量散列postgresql中的聚合结果
问题描述
我想计算一组结果的哈希(sha256),我知道该怎么做:
SELECT digest( string_agg(id_,':'), 'sha256') from mytable order by id_;
它可以工作,但它首先连接所有记录,最后计算哈希。我希望它是增量的,以避免大型数据集(数百万行)可能出现的内存问题,例如:
SELECT digest_agg(id_, ':', 'sha256') from mytable order by id_;
确保散列在途中逐行递增计算。
解决方案
好的,我终于为此编写了 ac 扩展名。我把它贴在这里,以防它对某人有用。
/* ------- OS Includes ------------ */
#include <stdio.h>
#include <openssl/rsa.h>
#include <openssl/pem.h>
#include <openssl/err.h>
/* ----- PostgreSQL Includes -------*/
#include "postgres.h"
#include "fmgr.h"
#include "funcapi.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
PG_MODULE_MAGIC;
typedef struct sha256_state
{
SHA256_CTX* sha256;
bool has_data;
} SHA256_STATE;
/* --------- Prototypes ---------- */
void _PG_init(void);
void _PG_fini(void);
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS);
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS);
SHA256_STATE* sha256_init(void);
void sha256_update(SHA256_STATE* state, char* string, int32 string_size);
void sha256_final(SHA256_STATE* state, char outputBuffer[65]);
/* ------ Version1 convention ---- */
PG_FUNCTION_INFO_V1(dv_sha256_agg_sfunc);
PG_FUNCTION_INFO_V1(dv_sha256_agg_final);
// -----------------------------------------------------------
// Implementations
// -----------------------------------------------------------
void _PG_init(void) {/* Do nothing */}
void _PG_fini(void) {/* Do nothing */}
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS) {
SHA256_STATE* state;
bool is_first = PG_ARGISNULL(0);
text* string;
int32 string_size;
char* data;
char SEP[] = {'\n'}; // Hardcoded separator (can be improved)
if (is_first) {
// First iteration: Create the state
state = sha256_init();
}
else {
// Next iterations: Restore the state
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
}
if (PG_ARGISNULL(1)) {
PG_RETURN_POINTER(state);
}
// Get current chunk
string = PG_GETARG_TEXT_PP(1);
string_size = VARSIZE_ANY_EXHDR(string);
data = VARDATA_ANY(string);
// Add separator
if (state->has_data) {
sha256_update(state, SEP, 1);
}
// Update state
sha256_update(state, data, string_size);
state->has_data = true;
// Updated state
PG_RETURN_POINTER(state);
}
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS) {
SHA256_STATE* state;
char out[65];
text* hash = (text*) palloc(65 + VARHDRSZ);
if (PG_ARGISNULL(0)) {
PG_RETURN_NULL();
}
else {
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
sha256_final(state, out);
SET_VARSIZE(hash, 65 + VARHDRSZ);
memcpy(VARDATA(hash), out, 65);
PG_RETURN_TEXT_P(hash);
}
}
SHA256_STATE* sha256_init() {
SHA256_STATE* state = (SHA256_STATE*) palloc(sizeof(SHA256_STATE));
state->sha256 = (SHA256_CTX*) palloc(sizeof(SHA256_CTX));
SHA256_Init(state->sha256);
state->has_data = false;
return state;
}
void sha256_update(SHA256_STATE* state, char* string, int32 string_size) {
SHA256_Update(state->sha256, string, string_size);
}
void sha256_final(SHA256_STATE* state, char outputBuffer[65]) {
int i;
unsigned char hash[SHA256_DIGEST_LENGTH];
SHA256_Final(hash, state->sha256);
for(i = 0; i < SHA256_DIGEST_LENGTH; i++) {
sprintf(outputBuffer + (i * 2), "%02x", hash[i]);
}
outputBuffer[64] = 0;
}
聚合定义:
CREATE FUNCTION dv_sha256_agg_sfunc(state internal, input text)
RETURNS internal AS 'MODULE_PATHNAME',
'dv_sha256_agg_sfunc'
LANGUAGE C VOLATILE;
CREATE FUNCTION dv_sha256_agg_final(state internal)
RETURNS text AS 'MODULE_PATHNAME',
'dv_sha256_agg_final'
LANGUAGE C VOLATILE;
CREATE AGGREGATE dv_sha256_agg(input text) (
SFUNC = dv_sha256_agg_sfunc,
STYPE = internal,
FINALFUNC = dv_sha256_agg_final
);
测试:
select
dv_sha256_agg(id_::text),
encode(digest(string_agg(id_::text,E'\n'),'sha256'),'hex')
from
generate_series(1,100) id_(id_);
结果
dv_sha256_agg | encode
------------------------------------------------------------------+------------------------------------------------------------------
4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87 | 4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87
(1 row)
笔记:
- 分隔符被硬编码为 \n
- 空值被忽略
推荐阅读
- r - 在 R 中使用 terra 包从不同程度的栅格镶嵌错误?
- git - 如何获取特定版本的 Chromium 源代码?
- swift - 如何在swift中进行函数回调
- node.js - 如何解决 Google App Engine 上 VueJs / Express / Mongo 应用程序的 API 响应的“启用 Javascript”错误
- python - 我可以从 Django views.py 中的另一个函数调用一个函数吗?
- python - 为什么要将 RNN/LSTM 的训练和测试数据用于股票预测?
- laravel - 表会话 laravel 中的 user_id 用于多守卫
- swift - 将旋转的 NSImageView 作为子视图的 NSView 保存到 pdf 文件的问题 - NSImageView 未旋转
- python - (堆叠)带有单独标记观察的直方图
- java - 从异步改造调用响应填充列表的问题