javascript - 我可以在没有数据库的情况下在客户端的静态网站上轻松实现全文搜索吗?
问题描述
我在某个地方找到了这个脚本……实际上可能是在 npm 的源代码中……不确定,我只知道它不是我自己写的……但是看着它,我不禁想知道:对下面的代码(或类似的代码)进行重构,是否就可以对静态站点进行快速抓取,并返回一个 URL 列表,列出搜索词命中次数最多的页面……我不需要模糊搜索之类的花哨功能,也不是要求任何人替我编写代码,只是希望有第二(或第三)双眼睛看看这段代码,判断它是否具备实现简单全文搜索的潜力。
const fs = require("fs");
const path = require("path");
const npm = require("./npm.js");
const color = require("ansicolors");
const output = require("./utils/output.js");
const usageUtil = require("./utils/usage.js");
const { promisify } = require("util");
const glob = promisify(require("glob"));
const readFile = promisify(fs.readFile);
const didYouMean = require("./utils/did-you-mean.js");
const { cmdList } = require("./utils/cmd-list.js");
const usage = usageUtil("help-search", "npm help-search <text>");
const completion = require("./utils/completion/none.js");
const npmUsage = require("./utils/npm-usage.js");
/**
 * Callback-style entry point for the `help-search` command.
 *
 * Runs `helpSearch(args)`; invokes `cb()` with no arguments once it resolves
 * and `cb(err)` when it rejects. Because the rejection handler is chained
 * after the success handler, an exception thrown by `cb()` itself is also
 * passed back to `cb`.
 *
 * @param {Array} args - Search terms, forwarded unchanged to `helpSearch`.
 * @param {Function} cb - Completion callback.
 * @returns {Promise} The promise chain described above.
 */
const cmd = (args, cb) => {
  const notifySuccess = () => cb();
  const pending = helpSearch(args);
  return pending.then(notifySuccess).catch(cb);
};
/**
 * Search the markdown help documents for the given terms and report the hits.
 *
 * Steps:
 *   1. Glob every `*.md` file one directory below `../docs/content`.
 *   2. Load the files with `readFiles` and rank them with `searchFiles`.
 *   3. With exactly one result, delegate to `npm.commands.help` for that
 *      document and return whatever it returns.
 *   4. Otherwise print the formatted results followed by a "did you mean"
 *      hint for the first term, or fall back to `npmUsage(false)` when the
 *      formatted text is blank.
 *
 * @param {Array} args - Search terms.
 * @returns {Promise} Resolves once output has been produced.
 * @throws Throws the `usage` value when `args` is empty.
 */
const helpSearch = async (args) => {
  if (!args.length) {
    throw usage;
  }

  const docsRoot = path.resolve(__dirname, "..", "docs/content");
  const files = await glob(`${docsRoot}/*/*.md`);
  const data = await readFiles(files);
  const results = await searchFiles(args, data, files);

  // A single hit is unambiguous: show that help section directly.
  if (results.length === 1) {
    const section = path.basename(results[0].file, ".md");
    return npm.commands.help([section], (er) => {
      if (er) {
        throw er;
      }
    });
  }

  const formatted = formatResults(args, results);
  if (formatted.trim()) {
    output(formatted);
    output(didYouMean(args[0], cmdList));
    return;
  }
  npmUsage(false);
};
/**
 * Read every file as UTF-8 and return its text keyed by file path.
 *
 * A leading front-matter section (a `---` line, any number of lines, and a
 * closing `---` line) is removed and the remaining text is trimmed. All reads
 * are started together; each entry is stored as its own read finishes.
 *
 * @param {string[]} files - Paths to read.
 * @returns {Promise<Object>} Map of file path to cleaned content.
 */
const readFiles = async (files) => {
  const FRONT_MATTER = /^---\n(.*\n)*?---\n/;
  const contentByFile = {};

  const loadOne = async (file) => {
    const raw = await readFile(file, "utf8");
    contentByFile[file] = raw.replace(FRONT_MATTER, "").trim();
  };

  await Promise.all(files.map(loadOne));
  return contentByFile;
};
/**
 * Rank the loaded documents by how well they match the search terms.
 *
 * Matching is a case-insensitive substring test. For every document that
 * contains at least one term, the text is split into lines and reduced to
 * the matching lines plus nearby context; runs of dropped lines collapse to a
 * single `null` marker (leading and trailing markers are removed).
 *
 * Hit counts are accumulated in a `Map` and then copied onto a plain object
 * as own data properties. Counting directly on a `{}` literal misbehaves for
 * terms that collide with `Object.prototype` members: `"constructor"` picks
 * up the inherited function and turns the count into a string, and
 * `"__proto__"` is swallowed by the prototype setter.
 *
 * @param {string[]} args - Search terms.
 * @param {Object} data - Map of file path to document text.
 * @param {string[]} files - Not used; kept so existing callers keep working.
 * @returns {Promise<Array>} At most 10 entries of
 *   `{ file, cmd, lines, found, hits, totalHits }`, ordered by number of
 *   distinct terms found, then total hits, then number of kept lines
 *   (all descending).
 */
const searchFiles = async (args, data, files) => {
  // Lower-case the terms once instead of on every comparison.
  const terms = args.map((arg) => arg.toLowerCase());
  const hasTerm = (text) => {
    const lower = text.toLowerCase();
    return terms.some((term) => lower.includes(term));
  };

  const results = [];
  for (const [file, content] of Object.entries(data)) {
    // skip if no matches at all
    if (!hasTerm(content)) continue;

    const lines = content.split(/\n+/);
    // Keep matching lines together with their neighbours:
    //  - if the NEXT line matches, keep this line, the next one and the one
    //    after it, then resume scanning past them;
    //  - if THIS line matches, keep it and the following line;
    //  - otherwise mark the line as dropped (null).
    for (let i = 0; i < lines.length; i++) {
      const nextLine = lines[i + 1];
      if (nextLine && hasTerm(nextLine)) {
        i += 2;
        continue;
      }
      if (hasTerm(lines[i])) {
        i++;
        continue;
      }
      lines[i] = null;
    }

    // now squish any string of nulls into a single null
    const pruned = lines.reduce((kept, line) => {
      if (!(line === null && kept[kept.length - 1] === null)) kept.push(line);
      return kept;
    }, []);
    if (pruned[pruned.length - 1] === null) pruned.pop();
    if (pruned[0] === null) pruned.shift();

    // now count how many args were found
    const counts = new Map();
    let totalHits = 0;
    for (const line of pruned) {
      if (line === null) continue;
      const lower = line.toLowerCase();
      args.forEach((arg, idx) => {
        const hit = lower.split(terms[idx]).length - 1;
        if (hit > 0) {
          counts.set(arg, (counts.get(arg) || 0) + hit);
          totalHits += hit;
        }
      });
    }

    // defineProperty (rather than assignment) guarantees an own data
    // property even for keys such as "__proto__".
    const hits = {};
    for (const [arg, count] of counts) {
      Object.defineProperty(hits, arg, {
        value: count,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }

    const docName = path.basename(file, ".md").replace(/^npm-/, "");
    results.push({
      file,
      cmd: `npm help ${docName}`,
      lines: pruned,
      found: Object.keys(hits),
      hits,
      totalHits,
    });
  }

  // sort results by number of results found, then by number of hits
  // then by number of matching lines
  // coverage is ignored here because the contents of results are
  // nondeterministic due to either glob or readFiles or Object.entries
  return results
    .sort(
      /* istanbul ignore next */ (a, b) =>
        b.found.length - a.found.length ||
        b.totalHits - a.totalHits ||
        b.lines.length - a.lines.length
    )
    .slice(0, 10);
};
/**
 * Render search results as display text.
 *
 * Each result produces a header line: the help command, padding, and the
 * per-term hit counts (`term:count`, sorted as strings). The width is based
 * on `process.stdout.columns`, capped at 80.
 *
 * - Short mode (`npm.flatOptions.long` falsy): the header lines are wrapped
 *   in a "Top hits for ..." banner and a hint about `--long`.
 * - Long mode: each header is followed by a dashed rule and the non-null
 *   context lines found at indexes 0-3 of `res.lines`. When `npm.color` is
 *   truthy every occurrence of a search term in those lines is highlighted.
 *
 * Highlighting is done in one pass over each line. Looping over the terms
 * and re-emitting the line per term (as an earlier version did) duplicates
 * the line for multi-term searches and appends empty colored segments.
 *
 * @param {string[]} args - Search terms.
 * @param {Array} results - Entries shaped like the output of `searchFiles`
 *   (`cmd`, `hits` and `lines` are read).
 * @returns {string} Trimmed display text; empty when there are no results.
 */
const formatResults = (args, results) => {
  const cols = Math.min(process.stdout.columns || Infinity, 80) + 1;

  // Build one case-insensitive matcher for all terms. Terms are escaped so
  // they match literally, and longer terms come first so that "install"
  // wins over "in" at the same position.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const needles = args
    .filter((arg) => arg.length > 0)
    .sort((a, b) => b.length - a.length);
  const matcher = needles.length
    ? new RegExp(needles.map(escapeRegExp).join("|"), "gi")
    : null;
  const highlight = (line) =>
    matcher
      ? line.replace(matcher, (word) => color.bgBlack(color.red(word)))
      : line;

  const out = results
    .map((res) => {
      const summary = Object.keys(res.hits)
        .map((k) => `${k}:${res.hits[k]}`)
        .sort((a, b) => (a > b ? 1 : -1))
        .join(" ");
      // Always leave at least one space between command and summary.
      const padding = " ".repeat(
        Math.max(1, cols - res.cmd.length - summary.length - 1)
      );
      const header = `${res.cmd}${padding}${summary}`;
      if (!npm.flatOptions.long) return header;

      const block = ["\n\n", header, "\n", "-".repeat(cols - 1) + "\n"];
      res.lines.forEach((line, i) => {
        // null marks elided lines; only the first four slots are shown.
        if (line === null || i > 3) return;
        block.push((npm.color ? highlight(line) : line) + "\n");
      });
      return block.join("");
    })
    .join("\n");

  const finalOut =
    results.length && !npm.flatOptions.long
      ? "Top hits for " +
        args.map(JSON.stringify).join(" ") +
        "\n" +
        "—".repeat(cols - 1) +
        "\n" +
        out +
        "\n" +
        "—".repeat(cols - 1) +
        "\n" +
        "(run with -l or --long to see more context)"
      : out;
  return finalOut.trim();
};
// Export the command function itself, with `usage` and `completion`
// attached to it as properties.
cmd.usage = usage;
cmd.completion = completion;
module.exports = cmd;
解决方案
根据您网站的结构和生成方式,我看不出客户端文本搜索有什么行不通的理由。不过我不建议在客户端抓取该站点,更好的做法是在构建时生成一个数据文件,然后以此为基础进行搜索。
如果您的静态站点是使用静态站点生成器生成的,您也许可以让静态站点生成器创建一个包含所有内容的 JSON 文件。否则,如果它只是静态资产,您可能会创建一个脚本来读取您的内容并以这种方式创建数据文件。
还有很多可用的库可以搜索 JSON 对象,例如 fuse.js。
客户端搜索的主要问题是要搜索的文本量。如果您有很多内容,客户端必须将所有内容加载到内存中,这可能是一个问题,尽管您必须针对您的特定用例进行测试。
推荐阅读
- python - 为 OCR 清理本书页面的最佳方法
- javascript - 使用服务器端事件 (SSE) 将更新推送到使用 Javascript 的 Web 客户端
- reactjs - React Native BackHandler 无法正常工作
- html - div背景图像未显示
- setuptools - 当我尝试将包上传到 PyPI 时,为什么会上传失败?
- powerbi - DAX:根据持续时间和顺序计算开始日期
- c++ - LoopInfoBase 类的迭代器
- java - org.hibernate.boot.registry.classloading.spi.ClassLoadingException:无法加载类 [emp.hbm.xml]
- wiremock - 如何使用 xPath 获取 xml 属性变量(双精度类型)以响应使用 Wiremock
- c# - asp.net 核心试图选择包含确切 ID 的对象