问题标题: 问答搜索器(C++) 1.0

0
0
已解决
薛乘志
薛乘志
初级启示者
初级启示者

c++版本的问答搜索器

如果编译出错,请按下图所示添加编译参数:


#include <bits/stdc++.h>
#include <windows.h>
#include <conio.h>
#include <urlmon.h>
#include <pthread.h>
#pragma comment (lib, "urlmon.lib")

using namespace std;

void gotoxy(short x, short y) { //移动光标
	COORD coord = {x, y};
	SetConsoleCursorPosition(GetStdHandle(STD_OUTPUT_HANDLE), coord);
	return;
}

void hideC() { //隐藏光标
	HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
	CONSOLE_CURSOR_INFO CursorInfo;
	GetConsoleCursorInfo(handle, &CursorInfo);
	CursorInfo.bVisible = false;
	SetConsoleCursorInfo(handle, &CursorInfo);
}

string ToUtf8(const char *strAnsi) { //Ansi转为Utf8编码
	UINT nLen = MultiByteToWideChar(936, 0, strAnsi, -1, NULL, 0);
	WCHAR *wszBuffer = new WCHAR[nLen + 1];
	nLen = MultiByteToWideChar(936, 0, strAnsi, -1, wszBuffer, nLen);
	wszBuffer[nLen] = 0;
	nLen = WideCharToMultiByte(CP_UTF8, 0, wszBuffer, -1, NULL, 0, NULL, NULL);
	CHAR *szBuffer = new CHAR[nLen + 1];
	nLen = WideCharToMultiByte(CP_UTF8, 0, wszBuffer, -1, szBuffer, nLen, NULL, NULL);
	szBuffer[nLen] = 0;
	string s = szBuffer;
	delete []wszBuffer;
	delete []szBuffer;
	return s;
}

// Convert a NUL-terminated UTF-8 string to the system default ANSI code page
// (CP_ACP).
//
// szU8: source text, NUL-terminated.
// Returns the converted text, or an empty string when the input is empty or
// a conversion step fails. Buffers are owned by std::vector so nothing leaks
// if an allocation throws part-way through.
string ToAnsi(const char *szU8) {
	const int srcLen = static_cast<int>(strlen(szU8));
	if (srcLen == 0) {
		return string();
	}

	// Step 1: UTF-8 -> UTF-16. The explicit source length excludes the NUL,
	// so the wide buffer holds exactly wideLen characters.
	const int wideLen = MultiByteToWideChar(CP_UTF8, 0, szU8, srcLen, NULL, 0);
	if (wideLen <= 0) {
		return string();
	}
	vector<wchar_t> wide(wideLen, L'\0');
	if (MultiByteToWideChar(CP_UTF8, 0, szU8, srcLen, &wide[0], wideLen) <= 0) {
		return string();
	}

	// Step 2: UTF-16 -> ANSI.
	const int ansiLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], wideLen, NULL, 0, NULL, NULL);
	if (ansiLen <= 0) {
		return string();
	}
	vector<char> ansi(ansiLen + 1, '\0');
	if (WideCharToMultiByte(CP_ACP, 0, &wide[0], wideLen, &ansi[0], ansiLen, NULL, NULL) <= 0) {
		return string();
	}

	// The extra zeroed element guarantees termination for the C-string copy.
	return string(&ansi[0]);
}

string NetHex(const char *webFileName) { //按照网络地址格式编码
	char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
	char *pString = new char[strlen(webFileName)*sizeof(TCHAR) * 3];
	strcpy(pString, ToUtf8(webFileName).c_str());
	int nLength = strlen(pString);
	char pszEncode[2048];
	ZeroMemory(pszEncode, 2048);
	int pos = 0;
	for (int i = 0; i < nLength; i++) {
		unsigned char c = pString[i];
		if (c > 0x20 && c < 0x7f) {
			pszEncode[pos] = c;
			pos++;
		} else if (c == 0x20) {
			pszEncode[pos] = '+';
			pos++;
		} else {
			pszEncode[pos] = '%';
			pos++;
			pszEncode[pos] = hex[c / 16];
			pos++;
			pszEncode[pos] = hex[c % 16];
			pos++;
		}
	}
	delete[] pString;
	return pszEncode;
}

bool download(string url, string filename) { //下载文件
	HRESULT ret = URLDownloadToFileA(NULL, NetHex(url.c_str()).c_str(), filename.c_str(), 0, NULL);
	if (ret != S_OK) {
		return false;
	} else {
		return true;
	}
}

bool download_progress(string url, string filename) { //下载文件(进度显示)
	class DownloadProgress : public IBindStatusCallback {
		public:
			HRESULT __stdcall QueryInterface(const IID &, void **) {
				return E_NOINTERFACE;
			}
			ULONG STDMETHODCALLTYPE AddRef(void) {
				return 1;
			}
			ULONG STDMETHODCALLTYPE Release(void) {
				return 1;
			}
			HRESULT STDMETHODCALLTYPE OnStartBinding(DWORD dwReserved, IBinding *pib) {
				return E_NOTIMPL;
			}
			virtual HRESULT STDMETHODCALLTYPE GetPriority(LONG *pnPriority) {
				return E_NOTIMPL;
			}
			virtual HRESULT STDMETHODCALLTYPE OnLowResource(DWORD reserved) {
				return S_OK;
			}
			virtual HRESULT STDMETHODCALLTYPE OnStopBinding(HRESULT hresult, LPCWSTR szError) {
				return E_NOTIMPL;
			}
			virtual HRESULT STDMETHODCALLTYPE GetBindInfo(DWORD *grfBINDF, BINDINFO *pbindinfo) {
				return E_NOTIMPL;
			}
			virtual HRESULT STDMETHODCALLTYPE OnDataAvailable(DWORD grfBSCF, DWORD dwSize, FORMATETC *pformatetc, STGMEDIUM *pstgmed) {
				return E_NOTIMPL;
			}
			virtual HRESULT STDMETHODCALLTYPE OnObjectAvailable(REFIID riid, IUnknown *punk) {
				return E_NOTIMPL;
			}
			virtual HRESULT __stdcall OnProgress(ULONG ulProgress, ULONG ulProgressMax, ULONG ulStatusCode, LPCWSTR szStatusText) {
				if (ulProgressMax != 0) {
					double percentage = ulProgress * 1.0 / ulProgressMax * 100;
					gotoxy(0, 0);
					printf("进度:%.2f%%", percentage);
				}
				return S_OK;
			}
	};
	DownloadProgress progress;
	IBindStatusCallback* callback = (IBindStatusCallback*)&progress;
	HRESULT ret = URLDownloadToFileA(NULL, NetHex(url.c_str()).c_str(), filename.c_str(), 0, static_cast<IBindStatusCallback*>(&progress));
	if (ret != S_OK) {
		return false;
	} else {
		return true;
	}
}

// Write `str` to cout in a field of `len` byte columns, padding with spaces
// when the string is shorter.
//
// Bytes with the high bit set are treated as the lead byte of a double-byte
// (ANSI/GBK-style) character and are written together with the byte that
// follows, counting as two columns. The lead-byte test is made on the
// current byte, so an ASCII character is never paired with the character
// after it. A double-byte character that starts inside the field is always
// written whole, so the output can exceed `len` by one byte rather than
// splitting the character.
//
// str: text to print.
// len: field width in bytes; values <= 0 print nothing.
void print(string str, int len) {
	const int size = static_cast<int>(str.size());
	for (int i = 0; i < len; i++) {
		if (i >= size) {
			cout << " ";
			continue;
		}
		const bool isLeadByte = static_cast<unsigned char>(str[i]) >= 0x80;
		if (isLeadByte && i + 1 < size) {
			cout << str[i] << str[i + 1];
			i++; // the trail byte occupies the next column
		} else {
			// ASCII, or a dangling lead byte at the very end of the string.
			cout << str[i];
		}
	}
}

// Shared download state:
//   download_sum - number of pages whose download has completed
//   page_sum     - total number of pages to download
//   downloads[i] - 1 once page i has been claimed by a worker, 0 otherwise
int download_sum, page_sum, downloads[2048];

// Serialises access to download_sum and downloads[] between worker threads.
static pthread_mutex_t download_lock = PTHREAD_MUTEX_INITIALIZER;

// Claim the lowest unclaimed page number in [from, page_sum] and mark it in
// downloads[]. Returns the claimed page number, or 0 when nothing is left.
// Pages beyond the capacity of downloads[] are never claimed.
static int claim_page(int from) {
	const int capacity = static_cast<int>(sizeof(downloads) / sizeof(downloads[0]));
	int claimed = 0;
	pthread_mutex_lock(&download_lock);
	for (int i = (from < 1 ? 1 : from); i <= page_sum && i < capacity; i++) {
		if (downloads[i] == 0) {
			downloads[i] = 1;
			claimed = i;
			break;
		}
	}
	pthread_mutex_unlock(&download_lock);
	return claimed;
}

// Worker thread entry point.
//
// ids: heap-allocated int holding the page number at which this worker
//      starts looking for work; ownership passes to this function, which
//      deletes it.
// The worker repeatedly claims the next unclaimed page at or above its
// current position, downloads it to the file "page_<n>" (retrying until the
// download succeeds), and bumps download_sum. It returns 0 when no unclaimed
// page remains. Claiming and counting happen under download_lock, so no page
// is downloaded or counted twice.
void *getpage(void *ids) {
	int *start = static_cast<int*>(ids);
	const int first = *start;
	delete start; // the caller allocated this just to pass the start page

	for (int id = claim_page(first); id != 0; id = claim_page(id + 1)) {
		char idstr[32];
		sprintf(idstr, "%d", id);
		const string sidstr = idstr;
		while (!download("https://wenda.codingtang.com/?page=" + sidstr + "&filter=all&sort=newest", "page_" + sidstr)) {
			Sleep(200); // back off briefly instead of spinning on a failing download
		}

		pthread_mutex_lock(&download_lock);
		download_sum++;
		const double percent = download_sum * 1.0 / page_sum * 100;
		pthread_mutex_unlock(&download_lock);

		// "%%" prints a literal percent sign.
		printf("页面%d下载完成(%.2f%%)\n", id, percent);
		Sleep(1);
	}
	return 0;
}

void getp() {
	system("cls");
	cout << "获取总页数中...";
	while (!download("https://wenda.codingtang.com/?filter=all&sort=newest", "pages"));
	ifstream page("pages"); //读取下载文件
	string p((istreambuf_iterator<char>(page)), (istreambuf_iterator<char>()));
	page.close();
	p = ToAnsi(p.c_str()); //转换编码
	int sum = 0;
	//从标记代码处开始寻找数字
	for (int i = p.find("class=\"last page\">") + 18; i <= p.size(); i++) {
		if (p[i] >= '0' && p[i] <= '9') {
			sum = sum * 10 + p[i] - '0';
		} else {
			break;
		}
	}
	system("cls");
	cout << "获取完成,总页数:" << sum << endl;
	cout << "开始下载...\n";
	memset(downloads, 0, sizeof(downloads));
	pthread_t ptd;
	page_sum = sum;
	SYSTEM_INFO sf; //读取CPU核心数,智能选择合适的线程数
	GetSystemInfo(&sf);
	cout << "CPU核心数:" << sf.dwNumberOfProcessors << endl;
	//由于多线程原因,这里可能会导致CPU占用高(20%+)
	for (int i = 1; i <= sf.dwNumberOfProcessors; i++) {
		int *nid = new (int);
		(*nid) = i;
		pthread_create(&ptd, NULL, getpage, (void*)(nid));
	}
	while (download_sum < sum); //等待线程结束
	cout << "下载完成,开始解析...\n";
	ofstream lout("list");
	for (int i = 1; i <= sum; i++) {
		char idstr[1024];
		sprintf(idstr, "%d", i);  //数字转字符串
		string sidstr = "page_";
		sidstr += idstr;
		ifstream page(sidstr.c_str()); //读取下载文件
		string p((istreambuf_iterator<char>(page)), (istreambuf_iterator<char>()));
		page.close();
		p = ToAnsi(p.c_str()); //转换编码
		while (p.find("<a id=\"blue\" href=\"/questions/") != p.npos) { //如果还能找到标识字符串,则继续
			int num = 0, f = 0;
			string t = "";
			for (int i = p.find("<a id=\"blue\" href=\"/questions/") + 30; i < p.size(); i++) { //在找到处枚举
				if (p[i] == '<') { //如果为<则退出
					break;
				}
				if (p[i] >= '0' && p[i] <= '9' && f == 0) { //如果是数字就截取
					num = num * 10 + p[i] - '0';
				} else { //否则跳过html字符,并记录数字结束
					f = 1;
				}
				if (f == 1) { //如果数字结束则记录标题
					t += p[i];
				}
			}
			p.erase(p.find("<a id=\"blue\" href=\"/questions/"), 30); //删除此处的标识字符串
			t.erase(t.find("/\">"), 4); //删除不需要字符
			while (t[0] == ' ') { //去除行首空格
				t.erase(0, 1);
			}
			lout << num << " " << t << endl; //写入文件
		}
		printf("第%d页解析完成!\n", i);
	}
	lout.close();
}

// Question titles indexed by question id; an empty string means "no entry".
string title[200000];

// Prompt for a keyword, load the "list" cache file ("<id> <title>" per
// line) into title[], and print every entry whose title contains the
// keyword. Waits for a key press before returning.
//
// Entries from a previous load are cleared first so questions no longer in
// the cache do not show up, and ids outside the bounds of title[] are
// skipped instead of being written out of range.
void search() {
	system("cls");
	cout << "输入搜索关键词:\n";
	string name;
	cin >> name;
	system("cls");
	cout << "正在读取缓存...\n";
	ifstream hc("list");
	if (!hc) {
		cout << "无法读取缓存\n按任意键返回";
		getch();
		return;
	}

	const int capacity = static_cast<int>(sizeof(title) / sizeof(title[0]));
	for (int i = 0; i < capacity; i++) {
		title[i].clear();
	}

	string tit;
	int id;
	while (hc >> id) {
		hc.ignore(); // skip the single space between the id and the title
		getline(hc, tit);
		if (id >= 0 && id < capacity) {
			title[id] = tit;
		}
	}
	cout << "读取完成\n";
	system("cls");
	for (int i = 1; i < capacity; i++) {
		if (title[i].find(name) != string::npos) {
			cout << i << " " << title[i] << endl;
		}
	}
	cout << "\n按任意键返回";
	getch();
	system("cls");
}

// Entry point: set the console window title, then loop forever showing the
// two-item menu and dispatching on the key pressed ('1' rebuilds the cache,
// '2' searches it; any other key just redraws the menu).
int main() {
	system("title 问答搜索器(C++) 1.0");
	for (;;) {
		gotoxy(0, 0);
		cout << "1. 缓存\n" << "2. 搜索\n";
		switch (getch()) {
			case '1':
				getp();
				break;
			case '2':
				search();
				break;
			default:
				break;
		}
	}
	return 0;
}
薛乘志在2022-01-08 12:42:45追加了内容

思路:https://wenda.codingtang.com/questions/17646/

下载文件:https://www.zhihu.com/question/443339832

@席清源 


0
0
0
0
0
0
潘登
潘登
高级天翼
高级天翼

能不能升级一下?酷町侠关闭的问题就不行了。

我要回答