0
已解决
薛乘志
初级启示者
初级启示者
c++版本的问答搜索器
如果出现编译错误,请按图中所示添加编译参数(图片未包含在本文中;代码用到了 urlmon 和 pthread,使用 MinGW/Dev-C++ 时通常需要添加 -lurlmon -lpthread):
#include <bits/stdc++.h>
#include <windows.h>
#include <conio.h>
#include <urlmon.h>
#include <pthread.h>
#pragma comment (lib, "urlmon.lib")
using namespace std;
void gotoxy(short x, short y) { //移动光标
COORD coord = {x, y};
SetConsoleCursorPosition(GetStdHandle(STD_OUTPUT_HANDLE), coord);
return;
}
void hideC() { //隐藏光标
HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
CONSOLE_CURSOR_INFO CursorInfo;
GetConsoleCursorInfo(handle, &CursorInfo);
CursorInfo.bVisible = false;
SetConsoleCursorInfo(handle, &CursorInfo);
}
string ToUtf8(const char *strAnsi) { //Ansi转为Utf8编码
UINT nLen = MultiByteToWideChar(936, 0, strAnsi, -1, NULL, 0);
WCHAR *wszBuffer = new WCHAR[nLen + 1];
nLen = MultiByteToWideChar(936, 0, strAnsi, -1, wszBuffer, nLen);
wszBuffer[nLen] = 0;
nLen = WideCharToMultiByte(CP_UTF8, 0, wszBuffer, -1, NULL, 0, NULL, NULL);
CHAR *szBuffer = new CHAR[nLen + 1];
nLen = WideCharToMultiByte(CP_UTF8, 0, wszBuffer, -1, szBuffer, nLen, NULL, NULL);
szBuffer[nLen] = 0;
string s = szBuffer;
delete []wszBuffer;
delete []szBuffer;
return s;
}
string ToAnsi(const char *szU8) { //Utf8转为Ansi编码
int wcsLen = MultiByteToWideChar(CP_UTF8, 0, szU8, strlen(szU8), 0, 0);
wchar_t* wszMultiByte = new wchar_t[wcsLen + 1];
MultiByteToWideChar(CP_UTF8, 0, szU8, strlen(szU8), wszMultiByte, wcsLen);
wszMultiByte[wcsLen] = '\0';
int ansiLen = WideCharToMultiByte(CP_ACP, 0, wszMultiByte, wcslen(wszMultiByte), 0, 0, 0, 0);
char* szAnsi = new char[ansiLen + 1];
szAnsi[ansiLen] = '\0';
WideCharToMultiByte(CP_ACP, 0, wszMultiByte, wcslen(wszMultiByte), szAnsi, ansiLen, 0, 0);
string s = szAnsi;
delete []szAnsi;
delete []wszMultiByte;
return s;
}
string NetHex(const char *webFileName) { //按照网络地址格式编码
char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
char *pString = new char[strlen(webFileName)*sizeof(TCHAR) * 3];
strcpy(pString, ToUtf8(webFileName).c_str());
int nLength = strlen(pString);
char pszEncode[2048];
ZeroMemory(pszEncode, 2048);
int pos = 0;
for (int i = 0; i < nLength; i++) {
unsigned char c = pString[i];
if (c > 0x20 && c < 0x7f) {
pszEncode[pos] = c;
pos++;
} else if (c == 0x20) {
pszEncode[pos] = '+';
pos++;
} else {
pszEncode[pos] = '%';
pos++;
pszEncode[pos] = hex[c / 16];
pos++;
pszEncode[pos] = hex[c % 16];
pos++;
}
}
delete[] pString;
return pszEncode;
}
bool download(string url, string filename) { //下载文件
HRESULT ret = URLDownloadToFileA(NULL, NetHex(url.c_str()).c_str(), filename.c_str(), 0, NULL);
if (ret != S_OK) {
return false;
} else {
return true;
}
}
bool download_progress(string url, string filename) { //下载文件(进度显示)
class DownloadProgress : public IBindStatusCallback {
public:
HRESULT __stdcall QueryInterface(const IID &, void **) {
return E_NOINTERFACE;
}
ULONG STDMETHODCALLTYPE AddRef(void) {
return 1;
}
ULONG STDMETHODCALLTYPE Release(void) {
return 1;
}
HRESULT STDMETHODCALLTYPE OnStartBinding(DWORD dwReserved, IBinding *pib) {
return E_NOTIMPL;
}
virtual HRESULT STDMETHODCALLTYPE GetPriority(LONG *pnPriority) {
return E_NOTIMPL;
}
virtual HRESULT STDMETHODCALLTYPE OnLowResource(DWORD reserved) {
return S_OK;
}
virtual HRESULT STDMETHODCALLTYPE OnStopBinding(HRESULT hresult, LPCWSTR szError) {
return E_NOTIMPL;
}
virtual HRESULT STDMETHODCALLTYPE GetBindInfo(DWORD *grfBINDF, BINDINFO *pbindinfo) {
return E_NOTIMPL;
}
virtual HRESULT STDMETHODCALLTYPE OnDataAvailable(DWORD grfBSCF, DWORD dwSize, FORMATETC *pformatetc, STGMEDIUM *pstgmed) {
return E_NOTIMPL;
}
virtual HRESULT STDMETHODCALLTYPE OnObjectAvailable(REFIID riid, IUnknown *punk) {
return E_NOTIMPL;
}
virtual HRESULT __stdcall OnProgress(ULONG ulProgress, ULONG ulProgressMax, ULONG ulStatusCode, LPCWSTR szStatusText) {
if (ulProgressMax != 0) {
double percentage = ulProgress * 1.0 / ulProgressMax * 100;
gotoxy(0, 0);
printf("进度:%.2f%%", percentage);
}
return S_OK;
}
};
DownloadProgress progress;
IBindStatusCallback* callback = (IBindStatusCallback*)&progress;
HRESULT ret = URLDownloadToFileA(NULL, NetHex(url.c_str()).c_str(), filename.c_str(), 0, static_cast<IBindStatusCallback*>(&progress));
if (ret != S_OK) {
return false;
} else {
return true;
}
}
// Writes `str` to std::cout in a field exactly `len` bytes (console columns)
// wide: shorter text is padded with spaces, longer text is truncated.
//
// A byte with the high bit set is treated as the lead byte of a two-byte
// character (as in GBK) and is written together with its trail byte. If the
// pair does not fit in the remaining width, or the trail byte is missing, a
// space is written instead so a character is never cut in half.
//
// str: text to print.
// len: field width in bytes; values <= 0 print nothing.
void print(std::string str, int len) {
    const int size = static_cast<int>(str.size());
    for (int col = 0; col < len; ++col) {
        if (col >= size) {
            std::cout << " ";
            continue;
        }
        // Inspect the current byte (not the next one) and compare as unsigned
        // so the result does not depend on whether plain char is signed.
        const bool isLeadByte = static_cast<unsigned char>(str[col]) >= 0x80;
        if (!isLeadByte) {
            std::cout << str[col];
            continue;
        }
        const bool pairFits = (col + 1 < size) && (col + 1 < len);
        if (pairFits) {
            std::cout << str[col] << str[col + 1];
            ++col;  // the trail byte occupies the next column
        } else {
            std::cout << " ";
        }
    }
}
// Shared download state:
//   download_sum - number of pages whose download has finished,
//   page_sum     - total number of pages to download,
//   downloads[i] - 1 once page i has been claimed by a worker, 0 otherwise.
// NOTE: downloads has 2048 slots; whoever sets page_sum must keep it below that.
int download_sum, page_sum, downloads[2048];

// Serializes worker access to download_sum and downloads[].
static pthread_mutex_t g_page_lock = PTHREAD_MUTEX_INITIALIZER;

// Claims the first unclaimed page in [from, page_sum] under the lock.
// Returns the claimed page number, or 0 when no page is left.
static int claim_page_from(int from) {
    int claimed = 0;
    pthread_mutex_lock(&g_page_lock);
    for (int i = (from < 1 ? 1 : from); i <= page_sum; i++) {
        if (downloads[i] == 0) {
            downloads[i] = 1;  // mark as taken so no other worker downloads it
            claimed = i;
            break;
        }
    }
    pthread_mutex_unlock(&g_page_lock);
    return claimed;
}

// Worker thread entry point: downloads listing pages until none are left.
//
// ids: pointer to a heap-allocated int holding the first page this worker
//      should try; ownership passes to this function, which deletes it.
// Returns NULL.
//
// Each page is claimed under a mutex before it is downloaded, so every page
// is fetched by exactly one worker and download_sum is incremented exactly
// once per page.
void *getpage(void *ids) {
    int *start = static_cast<int*>(ids);
    int id = *start;
    delete start;  // the value has been copied; release the allocation

    id = claim_page_from(id);
    while (id != 0) {
        std::ostringstream number;
        number << id;
        const std::string sidstr = number.str();

        // Retry until the page arrives; pause briefly between attempts so a
        // failing connection is not hammered in a tight loop.
        while (!download("https://wenda.codingtang.com/?page=" + sidstr + "&filter=all&sort=newest", "page_" + sidstr)) {
            Sleep(100);
        }

        pthread_mutex_lock(&g_page_lock);
        const int finished = ++download_sum;  // count of completed downloads
        pthread_mutex_unlock(&g_page_lock);
        // "%%" prints a literal percent sign.
        printf("页面%d下载完成(%.2f%%)\n", id, finished * 1.0 / page_sum * 100);

        id = claim_page_from(id + 1);
        if (id != 0) {
            Sleep(1);
        }
    }
    return NULL;  // nothing left to download: the thread ends
}
void getp() {
system("cls");
cout << "获取总页数中...";
while (!download("https://wenda.codingtang.com/?filter=all&sort=newest", "pages"));
ifstream page("pages"); //读取下载文件
string p((istreambuf_iterator<char>(page)), (istreambuf_iterator<char>()));
page.close();
p = ToAnsi(p.c_str()); //转换编码
int sum = 0;
//从标记代码处开始寻找数字
for (int i = p.find("class=\"last page\">") + 18; i <= p.size(); i++) {
if (p[i] >= '0' && p[i] <= '9') {
sum = sum * 10 + p[i] - '0';
} else {
break;
}
}
system("cls");
cout << "获取完成,总页数:" << sum << endl;
cout << "开始下载...\n";
memset(downloads, 0, sizeof(downloads));
pthread_t ptd;
page_sum = sum;
SYSTEM_INFO sf; //读取CPU核心数,智能选择合适的线程数
GetSystemInfo(&sf);
cout << "CPU核心数:" << sf.dwNumberOfProcessors << endl;
//由于多线程原因,这里可能会导致CPU占用高(20%+)
for (int i = 1; i <= sf.dwNumberOfProcessors; i++) {
int *nid = new (int);
(*nid) = i;
pthread_create(&ptd, NULL, getpage, (void*)(nid));
}
while (download_sum < sum); //等待线程结束
cout << "下载完成,开始解析...\n";
ofstream lout("list");
for (int i = 1; i <= sum; i++) {
char idstr[1024];
sprintf(idstr, "%d", i); //数字转字符串
string sidstr = "page_";
sidstr += idstr;
ifstream page(sidstr.c_str()); //读取下载文件
string p((istreambuf_iterator<char>(page)), (istreambuf_iterator<char>()));
page.close();
p = ToAnsi(p.c_str()); //转换编码
while (p.find("<a id=\"blue\" href=\"/questions/") != p.npos) { //如果还能找到标识字符串,则继续
int num = 0, f = 0;
string t = "";
for (int i = p.find("<a id=\"blue\" href=\"/questions/") + 30; i < p.size(); i++) { //在找到处枚举
if (p[i] == '<') { //如果为<则退出
break;
}
if (p[i] >= '0' && p[i] <= '9' && f == 0) { //如果是数字就截取
num = num * 10 + p[i] - '0';
} else { //否则跳过html字符,并记录数字结束
f = 1;
}
if (f == 1) { //如果数字结束则记录标题
t += p[i];
}
}
p.erase(p.find("<a id=\"blue\" href=\"/questions/"), 30); //删除此处的标识字符串
t.erase(t.find("/\">"), 4); //删除不需要字符
while (t[0] == ' ') { //去除行首空格
t.erase(0, 1);
}
lout << num << " " << t << endl; //写入文件
}
printf("第%d页解析完成!\n", i);
}
lout.close();
}
// Number of slots in the title table; valid question ids are 0..kTitleSlots-1.
static const int kTitleSlots = 200000;
// title[id] holds the cached title of question `id` (empty when unknown).
std::string title[kTitleSlots];

// Prompts for a keyword, loads the "list" cache file into `title`, and prints
// every question (id >= 1) whose title contains the keyword.
void search() {
    system("cls");
    std::cout << "输入搜索关键词:\n";
    std::string name;
    std::cin >> name;
    system("cls");
    std::cout << "正在读取缓存...\n";
    std::ifstream hc("list");
    if (!hc) {
        std::cout << "无法读取缓存\n按任意键返回";
        getch();
        return;
    }

    // Forget titles from an earlier search so stale entries cannot match.
    for (int i = 0; i < kTitleSlots; i++) {
        title[i].clear();
    }

    std::string tit;
    int id;
    for (;;) {
        if (hc >> id) {
            hc.ignore();  // skip the single separator after the id
            tit.clear();
            std::getline(hc, tit);
            // Ignore ids that do not fit the table instead of writing out of bounds.
            if (id >= 0 && id < kTitleSlots) {
                title[id] = tit;
            }
        } else if (hc.eof() || hc.bad()) {
            break;
        } else {
            // The line does not start with a number: skip it and keep loading.
            hc.clear();
            hc.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
        }
    }
    std::cout << "读取完成\n";
    system("cls");

    for (int i = 1; i < kTitleSlots; i++) {
        if (title[i].find(name) != std::string::npos) {
            std::cout << i << " " << title[i] << std::endl;
        }
    }
    std::cout << "\n按任意键返回";
    getch();
    system("cls");
}
// Entry point: sets the console title, then loops forever showing a two-item
// menu and dispatching on the pressed key ('1' rebuilds the cache, '2' searches).
int main() {
    system("title 问答搜索器(C++) 1.0");
    for (;;) {
        gotoxy(0, 0);
        std::cout << "1. 缓存\n" << "2. 搜索\n";
        switch (getch()) {
            case '1':
                getp();
                break;
            case '2':
                search();
                break;
            default:
                break;  // any other key just redraws the menu
        }
    }
    return 0;
}
薛乘志在2022-01-08 12:42:45追加了内容
思路:https://wenda.codingtang.com/questions/17646/
下载文件:https://www.zhihu.com/question/443339832
@席清源