d3.js - dc.js 系列图表 - 填充缺失数据时图表太慢
问题描述
我想创建一个多时间线图作为系列图。
我阅读了 Stack Overflow 上关于在 dc.js lineChart 中填充缺失数据的问题:“填充缺失的日期并在没有数据的情况下显示零”。
问题:我在那里实现了代码,它对单折线图效果很好。对于系列图表,我需要对其进行一些调整。它可以工作,但是性能很糟糕。
这是我们使用的示例数据:
// Sample records used by the chart example below. Each record has a store
// description, a "lat lon" location string and a numeric `timeReported`
// (presumably epoch milliseconds — confirm against the data source).
// NOTE(review): the chart code reads `d.timeReportedDate`, which these records
// do not carry; presumably it is derived from `timeReported` elsewhere.
let data = [
  {description: "Walmart", location: "40.216403 -74.541296", timeReported: 1581710670184},
  {description: "Target", location: "38.271996 -84.032575", timeReported: 1583524065011},
  {description: "Wendys", location: "39.255831 -75.532763", timeReported: 1583524065011},
  {description: "7-11", location: "34.925349 -78.463977", timeReported: 1583524065011},
  {description: "WaWa", location: "35.716208 -77.741230", timeReported: 1583524065013},
  {description: "7-11", location: "41.258950 -83.888060", timeReported: 1583524065013},
  {description: "Shell", location: "37.879694 -79.836127", timeReported: 1583524065011},
  {description: "Dominos", location: "35.890273 -80.700329", timeReported: 1583524065395},
  {description: "Dominos", location: "39.268777 -78.743366", timeReported: 1583524065397},
  {description: "Walgreens", location: "35.490215 -81.773863", timeReported: 1583524065399},
  {description: "7-11", location: "37.974797 -81.393449", timeReported: 1583524065506},
  {description: "Wendys", location: "40.859685 -76.963065", timeReported: 1583524065521},
  {description: "CVS", location: "38.517910 -78.251419", timeReported: 1583524065553},
  {description: "CVS", location: "35.947033 -81.616061", timeReported: 1583524142169},
  {description: "Shell", location: "39.566535 -77.992499", timeReported: 1583524142176},
  {description: "Target", location: "37.832142 -88.003151", timeReported: 1583524142170},
  {description: "Wendys", location: "40.245397 -80.061998", timeReported: 1583524142223},
  {description: "Macys", location: "39.631265 -75.157194", timeReported: 1583524142223},
  {description: "Macys", location: "36.631458 -77.803286", timeReported: 1583524142213},
  {description: "7-11", location: "36.249754 -79.830006", timeReported: 1583524142251},
  {description: "7-11", location: "41.138285 -83.298142", timeReported: 1583524142249},
  {description: "Wendys", location: "34.940485 -77.230388", timeReported: 1583524142249},
  {description: "7-11", location: "39.605373 -77.448768", timeReported: 1583524142296},
  {description: "Wendys", location: "35.609094 -79.455712", timeReported: 1583524142293},
  {description: "WaWa", location: "37.130753 -78.076709", timeReported: 1583524142310},
  {description: "Macys", location: "40.058482 -78.497258", timeReported: 1583524142338},
  {description: "Wendys", location: "39.255831 -75.532763", timeReported: 1582058735883},
  {description: "Macys", location: "39.631265 -75.157194", timeReported: 1582058735883},
  {description: "7-11", location: "36.249754 -79.830006", timeReported: 1582058735883},
  {description: "7-11", location: "39.605373 -77.448768", timeReported: 1582058735883},
  {description: "Wendys", location: "35.609094 -79.455712", timeReported: 1582058735883},
  {description: "WaWa", location: "37.130753 -78.076709", timeReported: 1582058735883},
  {description: "Macys", location: "40.058482 -78.497258", timeReported: 1582058735883},
  {description: "Kohls", location: "40.373533 -101.057470", timeReported: 1582838559493},
];
这是示例代码。顺便说一句,下面代码中的 curTimeInterval 只是某个 d3 时间间隔(time interval)的别名,可以由用户选择(d3.timeHour、d3.timeDay、d3.timeWeek、d3.timeMonth)。
// Builds a dc.js series chart (one line per store description) over the sample
// records, using crossfilter for grouping and a zero-filled "fake group" so
// that intervals without data are drawn as zero.
// NOTE(review): cf, dateDim and reportedGroup are assigned without a
// declaration here — presumably declared elsewhere; confirm.
cf = crossfilter(data);
// Dimension keyed by the record's date snapped to the user-selected interval.
// NOTE(review): the sample records shown with this snippet carry `timeReported`,
// not `timeReportedDate` — presumably a Date field is derived elsewhere; confirm.
dateDim = cf.dimension((d) => {
return curTimeInterval(d.timeReportedDate);
});
// Counts records per interval (each record contributes 1).
reportedGroup = dateDim.group().reduceSum((d) => 1);
// Earliest and latest interval keys present in the data.
let minDate = d3.min(reportedGroup.all(), (kv) => {
return kv.key;
});
let maxDate = d3.max(reportedGroup.all(), (kv) => {
return kv.key;
});
// Pad the x domain by two intervals on each side.
minDate = curTimeInterval.offset(minDate, -2);
maxDate = curTimeInterval.offset(maxDate, 2);
// Composite dimension: keys are [description, interval start].
const runDimension = cf.dimension((d) => {
return [d.description, curTimeInterval(d.timeReportedDate)];
});
const runGroup = runDimension.group();
// Wrap the group so missing intervals are reported with a value of zero.
// fill_composite_intervals must already be defined when this statement runs.
const filledSeries = fill_composite_intervals(runGroup, curTimeInterval);
const seriesChart = new dc.SeriesChart('#series');
seriesChart
.width(768)
.height(480)
// Each series is rendered as its own line chart with a cardinal curve.
.chart(function(c) {
return new dc.LineChart(c).curve(d3.curveCardinal);
})
.x(d3.scaleTime().domain([minDate, maxDate]))
.xUnits(curTimeInterval.range)
.brushOn(false)
.clipPadding(10)
.elasticY(true)
.dimension(runDimension)
.group(filledSeries)
.mouseZoomable(true)
// Composite key layout: key[0] is the series (description), key[1] the x value.
.seriesAccessor((d) => {
return d.key[0];
})
.keyAccessor((d) => {
return d.key[1];
})
.valueAccessor((d) => {
return d.value;
})
.legend(dc.legend().x(350).y(350).itemHeight(13).gap(5).horizontal(1).legendWidth(140).itemWidth(70))
// From here on the chain configures the y axis object rather than the chart
// (presumably dc's yAxis() returns the underlying d3 axis — confirm).
.yAxis()
// NOTE(review): `min` and `max` are not defined anywhere in this snippet.
.tickValues(d3.range(min > 0 ? min - 1 : min, max + 1));
// Widen the left margin by 40 on top of its current value.
seriesChart.margins().left += 40;
/**
 * Wraps a group whose keys are [category, Date] pairs so that every category
 * reports a row for every interval in the data's time span, with a value of 0
 * wherever the wrapped group has no row.
 *
 * The time span runs from the earliest to the latest key found in the group.
 * Rows are returned category by category (in order of each category's first
 * appearance in time order), and in ascending time order within a category.
 *
 * @param {{all: function(): Array<{key: Array, value: number}>}} group
 *   Source group; each row's key is [category, Date].
 * @param {{range: function(Date, Date): Date[]}} interval
 *   Object providing range(start, stop), e.g. a d3 time interval.
 * @returns {{all: function(): Array<{key: Array, value: number}>}}
 *   A group-like object whose all() yields the zero-filled rows.
 */
const fill_composite_intervals = (group, interval) => ({
  all() {
    const source = group.all();
    if (!source.length) {
      return [];
    }
    // Sort a copy: the array returned by group.all() may be shared with the
    // group itself, so reordering it in place could disturb other consumers.
    const sorted = [...source].sort((a, b) => a.key[1].getTime() - b.key[1].getTime());

    // Every interval boundary between the earliest and latest key. Whether the
    // stop bound is included is up to interval.range; rows it leaves out are
    // still emitted by the tail loop below.
    const target = interval.range(sorted[0].key[1], sorted[sorted.length - 1].key[1]);

    // Split the rows per category, keeping time order within each category.
    const byCategory = new Map();
    for (const {key: [category, time], value} of sorted) {
      let rows = byCategory.get(category);
      if (!rows) {
        rows = [];
        byCategory.set(category, rows);
      }
      rows.push({key: new Date(time), value});
    }

    const filled = [];
    for (const [category, rows] of byCategory) {
      let oi = 0;
      let ti = 0;
      // Merge the category's rows with the full list of intervals.
      while (oi < rows.length && ti < target.length) {
        const rowTime = rows[oi].key.getTime();
        const slotTime = target[ti].getTime();
        if (rowTime <= slotTime) {
          filled.push({key: [category, rows[oi].key], value: rows[oi].value});
          oi += 1;
          // Only consume the interval when the row sits exactly on it.
          if (rowTime === slotTime) {
            ti += 1;
          }
        } else {
          filled.push({key: [category, target[ti]], value: 0});
          ti += 1;
        }
      }
      // Rows later than the last interval are passed through unchanged.
      for (; oi < rows.length; oi += 1) {
        filled.push({key: [category, rows[oi].key], value: rows[oi].value});
      }
      // Intervals after the category's last row become zeros.
      for (; ti < target.length; ti += 1) {
        filled.push({key: [category, target[ti]], value: 0});
      }
    }
    return filled;
  },
});
解决方案
我首先创建了一个用来说明问题的 fiddle(在线示例)。其中比较有趣的是一个选择菜单,它显示了哪些时间间隔适合图表的数据和缩放级别(域)。
显示超过 width/2 个点是不合适的(因为它们无法全部被渲染),显示少于两个点也不合适,所以“不合适”的选项以灰色斜体显示:
它使用对象将间隔名称映射到相应 d3 间隔中的毫秒数:
// Nominal length in milliseconds of each d3 time interval, keyed by the
// interval's property name on the d3 object and ordered from finest to
// coarsest. Months and years use fixed lengths (a 365-day year, and one
// twelfth of it per month).
const intervals = (() => {
  const second = 1000;
  const minute = 60 * second;
  const hour = 60 * minute;
  const day = 24 * hour;
  const year = 365 * day;
  return {
    timeSecond: second,
    timeMinute: minute,
    timeHour: hour,
    timeDay: day,
    timeWeek: 7 * day,
    timeMonth: year / 12,
    timeYear: year,
  };
})();
allowed_intervals 函数用于确定第一个和最后一个合适的间隔:
/**
 * Finds the finest and the coarsest time interval that suit a chart showing
 * the given date domain.
 *
 * The finest suitable interval is the first entry of `intervals` that yields
 * fewer than chart.width() / 2 points across the domain. The coarsest is the
 * last entry whose d3 interval function maps the two domain ends to different
 * values, i.e. that still produces at least two distinct points.
 *
 * @param {{width: function(): number}} chart - Chart whose width limits the point count.
 * @param {Object<string, number>} intervals - Interval name (a property of the
 *   global `d3`) mapped to its length in milliseconds, finest first.
 * @param {Date[]} dateDomain - [start, end] of the displayed domain.
 * @returns {string[]} [finestName, coarsestName].
 * @throws {Error} 'date range too long' when even the coarsest interval gives too many points.
 * @throws {Error} 'date range too short' when no interval separates the two domain ends.
 */
function allowed_intervals(chart, intervals, dateDomain) {
  const entries = Object.entries(intervals);
  const [start, end] = dateDomain;
  const dt = end.getTime() - start.getTime();
  const maxPoints = chart.width() / 2;

  const first = entries.find(([, ms]) => dt / ms < maxPoints);
  if (!first) {
    throw new Error('date range too long');
  }

  // Search from the coarsest interval down.
  const last = [...entries].reverse().find(
    ([iname]) => d3[iname](start).getTime() !== d3[iname](end).getTime());
  if (!last) {
    // Both ends collapse to the same value even at the finest interval;
    // report it explicitly instead of failing on `last[0]`.
    throw new Error('date range too short');
  }
  return [first[0], last[0]];
}
所以这一切都很好。该示例会打印结果数据,我们可以看到,如果用 d3.timeMinute 来填充示例数据,原始的 15 个数据点会生成 332482 个数据点。这显然数据太多了,尤其是对于一个简单的示例。
这是一个可以找到合适的 d3 时间间隔的算法。但是,当我们启用缩放时它就会失效:比如现在可以放大到只显示一个小时,此时 timeMinute 是合适的;但如果对所有数据都使用这个间隔,点就太多了,图表会慢到几乎停滞。
所以我开始思考如何让它更有效率。我们实际上不需要填充每个缺失的时间间隔。我们真正需要的是确保我们捕捉到下降沿,当数据从非零变为零时,以及上升沿,当数据从零变为非零时。在这些情况下,我们只需要向输入数据添加零。
下面是 fill_composite_intervals 的新版本,它利用上升沿和下降沿,只添加显示这些沿所需的零:
/**
 * Wraps a group keyed by [category, time] so that each category's series
 * drops to zero around its data, without emitting a row for every empty
 * interval.
 *
 * Zeros are only inserted at "edges": the first empty interval after a value
 * (falling edge), the last empty interval before a value (rising edge), plus
 * one leading and one trailing zero per category.
 *
 * @param {{all: function(): Array<{key: Array, value: number}>}} group
 *   Source group with keys [category, Date] and numeric values.
 * @param {{range: function(Date, Date): Date[], offset: function(Date, number): Date}} interval
 *   A d3 time interval (only range and offset are used).
 * @returns {{all: function(): Array<{key: Array, value: number}>}}
 *   Group-like object producing the same rows plus the edge zeros.
 */
function fill_composite_intervals(group, interval) {
  const zeroAt = (tick) => ({key: tick, value: 0});

  // Merges one category's time-ordered rows with the interval ticks, emitting
  // zeros only where the series leaves or re-enters non-empty territory.
  const withEdgeZeros = (rows, ticks) => {
    const merged = [];
    if (!rows.length) {
      return merged;
    }
    let rowIdx = 0;
    let tickIdx = 0;
    let justZeroed = false; // the previous tick already produced a zero
    let zeroOwed = false;   // at least one empty tick was skipped since then
    while (rowIdx < rows.length && tickIdx < ticks.length) {
      const row = rows[rowIdx];
      const tick = ticks[tickIdx];
      if (row.key <= tick) {
        if (zeroOwed) {
          // Rising edge: put a zero on the tick just before this value.
          merged.push(zeroAt(ticks[tickIdx - 1]));
          zeroOwed = false;
        }
        merged.push(row);
        rowIdx += 1;
        if (row.key.getTime() === tick.getTime()) {
          tickIdx += 1;
        }
        justZeroed = false;
      } else {
        if (justZeroed) {
          zeroOwed = true;
        } else {
          // Falling edge: first empty tick after a value (or at the start).
          merged.push(zeroAt(tick));
          justZeroed = true;
        }
        tickIdx += 1;
      }
    }
    if (rowIdx < rows.length) {
      merged.push(...rows.slice(rowIdx));
    }
    if (tickIdx < ticks.length) {
      // A single trailing zero closes the series.
      merged.push(zeroAt(ticks[tickIdx]));
    }
    return merged;
  };

  return {
    all: function() {
      const output = [];
      const rowsByTime = group.all().slice();
      if (!rowsByTime.length) {
        return output;
      }
      // Work on a time-ordered copy of the group's rows.
      rowsByTime.sort((a, b) => a.key[1].getTime() - b.key[1].getTime());

      // All ticks spanning the data, widened at both ends so there is room
      // for the leading and trailing zeros.
      const firstTime = rowsByTime[0].key[1];
      const lastTime = rowsByTime[rowsByTime.length - 1].key[1];
      const ticks = interval.range(interval.offset(firstTime, -1),
                                   interval.offset(lastTime, 2));

      // Bucket the rows per category, preserving time order.
      const byCategory = new Map();
      for (const row of rowsByTime) {
        const [cat, time] = row.key;
        if (!byCategory.has(cat)) {
          byCategory.set(cat, []);
        }
        byCategory.get(cat).push({key: new Date(time), value: row.value});
      }

      // Fill each category and flatten back to [category, time] keys.
      for (const [cat, rows] of byCategory) {
        for (const {key: time, value} of withEdgeZeros(rows, ticks)) {
          output.push({key: [cat, time], value});
        }
      }
      return output;
    }
  };
}
请参阅代码中的注释以获取解释。它生成的数据量只与输入数据成比例,例如使用 timeMinute 时,15 个输入点只生成 67 个点,而不是 300+K 个。
有趣的是,我发现当零较少时,d3.curveCardinal 会产生奇怪的伪影。直觉上,我认为如果跳过了一些点,这条线会获得太多的“动量”。所以我改用了 d3.curveMonotoneX,我觉得它更合适。
.curve(d3.curveMonotoneX)
我还在 interval.range 的开头和结尾各做了填充,以便数据以零开始和结束,这样看起来更美观。
当您选择 d3.timeSecond 时,此示例仍然很慢(它仍然要迭代 300+K 个点),但使用 timeMinute 时性能似乎还不错,而这个间隔似乎正好与此数据的分辨率相符。
进一步可能的改进:
- 添加更多前导零和尾随零,以使曲线一致/对称
- 停止使用 interval.range,以免计算并丢弃那么多点;改为只用 interval.offset 以及下一个/上一个数据点来检测上升沿和下降沿(棘手!)