102202143 梁锦盛
1. Scraping Data from Eastmoney (东方财富网)
Using the Selenium framework with MySQL for storage, crawl stock data from three boards: Shanghai-Shenzhen A-shares (沪深A股), Shanghai A-shares (上证A股), and Shenzhen A-shares (深证A股).
I. Code and Results
1. Code file
from selenium import webdriver
from selenium.webdriver.common.by import By
import mysql.connector
import time

# Set up the MySQL connection
connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="2896685056Qq!",
    database="stock_data"
)
cursor = connection.cursor()

# Set up the Selenium driver
driver = webdriver.Chrome()

# Board name -> listing URL
board_urls = {
    "沪深A股": "http://quote.eastmoney.com/center/gridlist.html#hs_a_board",
    "上证A股": "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",
    "深证A股": "http://quote.eastmoney.com/center/gridlist.html#sz_a_board"
}

# Visit each board page and extract its data
for board_name, url in board_urls.items():
    driver.get(url)
    time.sleep(3)  # wait for the page to load

    # Grab every row of the stock table
    rows = driver.find_elements(By.XPATH, "//*[@id='table_wrapper-table']/tbody/tr")
    for row in rows:
        data = {
            "bStockNo": row.find_element(By.XPATH, "./td[2]/a").text,      # stock code
            "bStockName": row.find_element(By.XPATH, "./td[3]/a").text,    # stock name
            "latestPrice": row.find_element(By.XPATH, "./td[5]").text,     # latest price
            "changeRate": row.find_element(By.XPATH, "./td[6]").text,      # change rate
            "changeAmount": row.find_element(By.XPATH, "./td[7]").text,    # change amount
            "volume": row.find_element(By.XPATH, "./td[8]").text,          # volume
            "turnover": row.find_element(By.XPATH, "./td[9]").text,        # turnover
            "amplitude": row.find_element(By.XPATH, "./td[10]").text,      # amplitude
            "highPrice": row.find_element(By.XPATH, "./td[11]/span").text, # daily high
            "lowPrice": row.find_element(By.XPATH, "./td[12]").text,       # daily low
            "openPrice": row.find_element(By.XPATH, "./td[13]").text,      # today's open
            "prevClose": row.find_element(By.XPATH, "./td[14]").text       # previous close
        }

        # Strip the percent sign and convert to a decimal fraction
        data["changeRate"] = float(data["changeRate"].replace('%', '')) / 100 if '%' in data["changeRate"] else float(data["changeRate"])
        data["amplitude"] = float(data["amplitude"].replace('%', '')) / 100 if '%' in data["amplitude"] else float(data["amplitude"])
        print(data)

        # Insert the record into MySQL
        cursor.execute("""
            INSERT INTO stocks (bStockNo, bStockName, latestPrice, changeRate, changeAmount,
                                volume, turnover, amplitude, highPrice, lowPrice, openPrice, prevClose, board)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (data["bStockNo"], data["bStockName"], data["latestPrice"], data["changeRate"], data["changeAmount"],
              data["volume"], data["turnover"], data["amplitude"], data["highPrice"], data["lowPrice"],
              data["openPrice"], data["prevClose"], board_name))

# Commit the inserts
connection.commit()

# Close the connections
cursor.close()
connection.close()
driver.quit()
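The INSERT above assumes a stocks table already exists in the stock_data database. The assignment does not show its DDL, so the following is only a minimal sketch; the column types are my assumptions (changeRate and amplitude become decimals after the percent conversion, while volume and turnover keep the site's 万/亿 suffixes and therefore stay VARCHAR):

# Sketch of a possible schema for the stocks table (column types are assumptions)
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stocks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bStockNo VARCHAR(10),
        bStockName VARCHAR(32),
        latestPrice VARCHAR(16),
        changeRate DOUBLE,
        changeAmount VARCHAR(16),
        volume VARCHAR(16),
        turnover VARCHAR(16),
        amplitude DOUBLE,
        highPrice VARCHAR(16),
        lowPrice VARCHAR(16),
        openPrice VARCHAR(16),
        prevClose VARCHAR(16),
        board VARCHAR(16)
    )
""")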
2. Run results
3. Gitee folder link: https://gitee.com/liang-jinsheng-289668/project/tree/master/作业4
II. Reflections
Completing this assignment, I learned how to use Selenium to crawl data from different pages.
2. Scraping Course Data from China University MOOC (icourse163)
Using the Selenium framework + MySQL, crawl course information from the Chinese MOOC site (course ID, course name, school, lead teacher, team members, number of participants, course schedule, and course description).
I. Code and Results
1. Code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
import time

# Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--disable-extensions')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)

# MySQL connection settings
db = pymysql.connect(
    host='localhost',
    user='root',
    password='2896685056Qq!',
    database='MoocData',
    charset='utf8mb4'
)
cursor = db.cursor()

# Insert one course record into MySQL
def insert_data(course_info):
    sql = """
        INSERT INTO Courses (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    cursor.execute(sql, course_info)
    db.commit()
    print(f"Data inserted: {course_info}")

# Log in to icourse163
def login():
    driver.get("https://www.icourse163.org")
    print("Accessing login page")

    # Click the login button
    login_btn = driver.find_element(By.CLASS_NAME, "_3uWA6")
    login_btn.click()

    # Switch into the login iframe
    iframe = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.TAG_NAME, "iframe"))
    )
    driver.switch_to.frame(iframe)

    # Enter the phone number and password
    driver.find_element(By.ID, "phoneipt").send_keys("13015721181")
    driver.find_element(
        By.XPATH,
        "//input[@type='password' and @name='email' and contains(@class, 'j-inputtext')]"
    ).send_keys("2896685056Qq!")
    driver.find_element(By.ID, "submitBtn").click()

    # Switch back to the main document
    driver.switch_to.default_content()
    print("Login successful")
    time.sleep(5)

# Main scraping logic
def scrape_courses():
    driver.get("https://www.icourse163.org")
    print("Accessing course list page")

    scroll_pause_time = 3  # seconds to wait after each scroll
    last_height = driver.execute_script("return document.body.scrollHeight")

    try:
        # Scroll down repeatedly until the page height stops growing,
        # so that all _3KiL7 course cards are loaded (the list is
        # scroll-loaded; there is no next-page button to click)
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait for the course cards to be present
        courses = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "_3KiL7"))
        )
        for course in courses:
            try:
                # Basic info from the list card
                cCourse = course.find_element(By.CLASS_NAME, "_3EwZv").text
                print(cCourse)
                cCollege = course.find_element(By.CLASS_NAME, "_2lZi3").text
                print(cCollege)
                cCount = int(course.find_element(By.CLASS_NAME, "_3DcLu").text.replace("人参加", ""))
                print(cCount)

                # Open the course detail page (opens in a new tab)
                course.find_element(By.CLASS_NAME, "_3EwZv").click()
                driver.switch_to.window(driver.window_handles[-1])

                # Wait for the detail page to load
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "f-richEditorText"))
                )

                # Detail info
                cBrief = driver.find_element(By.CLASS_NAME, "f-richEditorText").text
                print(cBrief)
                cProcess = driver.find_element(By.CLASS_NAME, "text").text
                print(cProcess)
                cTeacher = driver.find_element(By.CLASS_NAME, "f-fc3").text
                print(cTeacher)
                # Team members: "cnt h3" is a compound selector, so use CSS
                # rather than By.CLASS_NAME (which rejects spaces)
                cTeam = ", ".join(t.text for t in driver.find_elements(By.CSS_SELECTOR, ".cnt h3"))

                # Insert the record
                course_info = (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
                insert_data(course_info)

                # Close the detail tab and return to the list page
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
            except Exception as e:
                print(f"Error processing course: {e}")
                driver.switch_to.window(driver.window_handles[0])
    except Exception as e:
        print(f"Error on page: {e}")

# Run the scraper
try:
    login()
    scrape_courses()
finally:
    driver.quit()
    db.close()
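Likewise, insert_data assumes a Courses table already exists in MoocData. A minimal schema sketch consistent with the code (the column types are my assumptions; cBrief is TEXT because course descriptions can be long):

# Sketch of a possible schema for the Courses table (column types are assumptions)
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Courses (
        id INT AUTO_INCREMENT PRIMARY KEY,
        cCourse VARCHAR(128),
        cCollege VARCHAR(64),
        cTeacher VARCHAR(64),
        cTeam VARCHAR(255),
        cCount INT,
        cProcess VARCHAR(64),
        cBrief TEXT
    )
""")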
2. Run results
3. Gitee folder link: https://gitee.com/liang-jinsheng-289668/project/tree/master/作业4
II. Reflections
I learned how to use Selenium to click through the course cards one by one, enter each detail page and navigate back, and scrape different pieces of information by element class name.
3. Flume Log Collection
Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx (the Huawei Cloud lab manual for real-time big-data analysis: Flume log collection experiment, in part).
I. Results
Task 1: Generate test data with a Python script
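The manual's own generator script is not reproduced here; the sketch below shows the general idea, appending one timestamped log line per second to a file that Flume can tail. The output path, field names, and log format are my assumptions, not the manual's.

import random
import time

# Continuously append fake access-log lines for Flume to pick up
# (path, fields, and format are assumptions, not the manual's exact script)
log_path = "/tmp/test.log"
users = ["user1", "user2", "user3"]
actions = ["login", "view", "click", "logout"]

while True:
    line = "{},{},{},{}".format(
        time.strftime("%Y-%m-%d %H:%M:%S"),
        random.choice(users),
        random.choice(actions),
        random.randint(100, 999)  # e.g. a response time in ms
    )
    with open(log_path, "a") as f:
        f.write(line + "\n")
    time.sleep(1)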
Task 2: Configure Kafka
Install the Kafka runtime environment
Install the Kafka client
Set environment variables and create a topic in Kafka (a programmatic equivalent is sketched below)
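In the lab this step is typically done with the kafka-topics.sh CLI; for reference, the same thing can be done from Python with the kafka-python package. The broker address and topic name below are assumptions, not values from the manual.

from kafka.admin import KafkaAdminClient, NewTopic

# Create a topic programmatically; equivalent to the kafka-topics.sh CLI step
# (broker address and topic name are assumptions)
admin = KafkaAdminClient(bootstrap_servers="192.168.0.10:9092")
admin.create_topics([NewTopic(name="flume_kafka", num_partitions=1, replication_factor=1)])
admin.close()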
Task 3: Install the Flume client
Install the Flume runtime environment
Install Flume into the directory "/opt/FlumeClient"
Task 4: Configure Flume to collect data
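A typical agent configuration for this pipeline is sketched below: an exec source tails the generated log file, a memory channel buffers events, and a Kafka sink publishes them to the topic created in Task 2. All names, paths, and addresses are assumptions, not the manual's exact values.

# Sketch of a Flume agent config (names, paths, and addresses are assumptions)
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /tmp/test.log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = 192.168.0.10:9092
a1.sinks.k1.kafka.topic = flume_kafka
a1.sinks.k1.channel = c1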
II. Reflections
Through this assignment, I came to understand the characteristics and advantages of the Kunpeng architecture, learned to create and manage server instances on Huawei Cloud, and practiced deploying Hadoop on Kunpeng servers, including installing, configuring, and starting HDFS. I learned to tune configuration parameters, plan resources, and use cloud platform services such as cloud disks and elastic IPs. I also came to appreciate the importance of real-time analysis, learned its pipeline and architecture, and saw how it differs from offline analysis; the right approach depends on the business scenario. The assignment sharpened my data-analysis thinking, showed me how offline analysis is applied in industry, and reminded me that I must keep learning new technologies, improving my skills, and building patience and perseverance.