# scrapMain.py
# -*- coding:utf-8 -*-
"""Scrape pass ratios for every link in a workbook using a small thread pool.

Flow: convert the raw workbook, read the (index, link) pairs out of the
converted workbook, let worker threads scrape each link, then write the
collected ratios back with ``writeRatioDate``.
"""
import os
import time
import traceback

try:
    import Queue  # Python 2 name
except ImportError:  # Python 3 renamed the module; keep the old alias
    import queue as Queue

import xlrd

from Excel_Main import Excel_Main
from ScrapData import ScrapData
from multiThread import MyThread
from write2Excel import writeRatioDate

SHARE_Q = Queue.Queue()  # unbounded job queue shared by all workers
DATA_SET = set()  # results collected by the workers
_WORKER_THREAD_NUM = 4  # number of worker threads to start


def handleExcel(fileUrl='Excel.xlsx'):
    '''
    Convert the raw workbook (per the original notes, the result is
    Intelligent_analysis.xlsx) by delegating to ``Excel_Main.handle``.

    Args:
        fileUrl: path of the workbook to convert.

    Raises:
        AssertionError: if ``fileUrl`` does not exist.
    '''
    excel = Excel_Main()
    # Raised explicitly (instead of a bare ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not os.path.exists(fileUrl):
        raise AssertionError("Input workbook not found: %s" % fileUrl)
    excel.handle(fileUrl)
    print("Creat 'Intelligent_analysis.xlsx' successfully!")


def getLinks():
    '''
    Read the links stored in Intelligent_analysis.xlsx.

    Returns:
        A list of (index, link) tuples built from columns A and L of the
        first sheet, without the first row (presumably the header).
        Empty when the sheet has no rows.
    '''
    workBook = xlrd.open_workbook("Intelligent_analysis.xlsx")
    workSheet = workBook.sheets()[0]
    # The links (log trace) live in column L, i.e. column index 11.
    # list() keeps this working where zip() returns a lazy iterator.
    tcIndex_link_set = list(
        zip(workSheet.col_values(0), workSheet.col_values(11)))
    print("Get links from 'Intelligent_analysis.xlsx'!")
    # Reports the row count including the first row, as before.
    print(len(tcIndex_link_set))
    # Slicing (rather than ``del lst[0]``) is safe on an empty sheet.
    return tcIndex_link_set[1:]


def worker():
    """
    Thread body: keep taking jobs from SHARE_Q until it is empty.

    Queue does its own locking, so no explicit lock is needed around
    get/put. Each job is an (index, link) pair; the scraped result is added
    to DATA_SET. A job that raises is reported and skipped so that the
    remaining jobs are still processed.
    """
    while True:
        # A non-blocking get avoids the empty()/get() race in which another
        # thread takes the last job and this one blocks forever.
        try:
            tcIndex_link_set = SHARE_Q.get(block=False)  # fetch a job
        except Queue.Empty:
            return

        start = time.time()
        try:
            scrapData = ScrapData()
            tcIndex_ratio_set = scrapData.getPassRation(tcIndex_link_set)
            DATA_SET.add(tcIndex_ratio_set)
            end = time.time()
            print("<<<<<<<<<<<<<<<<<<<<<<=================>>>>>>>>>>>>>>>>>>>>>>")
            print("One job Done! \nPassRatio: %s | Used time: %s | TotalItem: %i"
                  % (tcIndex_ratio_set, (end - start), len(DATA_SET)))
        except Exception:
            # Top-level boundary of the worker thread: report and move on.
            traceback.print_exc()
        finally:
            # Always acknowledge the job, otherwise SHARE_Q.join() in main()
            # would wait forever after a failed scrape.
            SHARE_Q.task_done()


def main():
    """Run the whole pipeline: convert, enqueue, scrape in threads, write."""
    # Preprocessing
    handleExcel()
    tcIndex_link_set = getLinks()

    # Queue every job before the workers start. (Original note: in real use
    # jobs should be fed in continuously.)
    for job in tcIndex_link_set:
        SHARE_Q.put(job)

    # Start _WORKER_THREAD_NUM worker threads.
    threads = []  # references retained; completion is awaited via the queue
    for _ in range(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()  # the thread begins processing jobs
        threads.append(thread)

    # Wait until every queued job has been acknowledged with task_done().
    SHARE_Q.join()

    # Write the results to Excel.xlsx
    writeRatioDate(DATA_SET, hightlightNum=5)


if __name__ == '__main__':
    startTime = time.time()
    main()
    print("Append PassRatio successfully!")
    endTime = time.time()
    print("Totally used time: %s" % (endTime - startTime))