有一个网站
https://www.houzz.ru/ideabooks/
有一个代码
import requests
from bs4 import BeautifulSoup
from time import sleep
# import time
from random import randrange
import json
def get_first_news():
url = "https://www.houzz.ru/ideabooks/"
headers = {
'Accept': '*/*',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
r = requests.get(url=url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")
for p in range(11, 33, 11):
url = f"https://www.houzz.ru/ideabooks/p/{p}"
print(url)
r = requests.get(url=url, headers=headers)
sleep(3)
soup = BeautifulSoup(r.text, "lxml")
all_name_links = soup.find_all(class_="gallery-card hz-browse-galleries-list__gallery")
news_dict = {}
for item in all_name_links:
author_name = item.find("a", class_="gallery-text__author-name").text.strip()
image_post = item.find("img", class_="gallery-image__responsive")['src']
dir_post = item.find("div", class_="gallery-text__description text-l").text.strip()
comment_post = item.find("a", class_="gallery-text__comments").text.strip()
title_post = item.find("a", class_="gallery-text__title hz-track-me").text.strip()
link_post = item.find("a", class_="gallery-text__title hz-track-me")["href"]
# print(f"{author_name}: {image_post}: {dir_post}: {comment_post}: {title_post}: {link_post}")
news_dict = {
"author_name": author_name,
"image_post": image_post,
"dir_post": dir_post,
"title_post": title_post,
"comment_post": comment_post,
"link_post": link_post
}
print(news_dict)
# with open("news_dict.json", "w") as file:
# json.dump(news_dict, file, indent=4, ensure_ascii=False)
def main():
get_first_news()
if __name__ == "__main__":
main()
写入json时,只保存一篇文章。打印显示了应有的所有内容,但 json 中只有一篇文章...告诉我,我做错了什么?
据我了解,写入文件的唯一文章是整个列表中的最后一篇文章。事实上,在循环的每次迭代结束时,您都会打开文件
news_dict.json
进行覆盖。旧数据被删除,新数据被写入其位置。为了避免这种情况,您应该在循环之前创建一个空列表并在那里累积字典
news_dict
(在每次迭代时使用方法append()
)。运行循环后,您可以使用注释掉的结构将字典列表保存到文件中。您将在循环的每次传递中将字典覆盖到文件中。你可以这样做:
该行
with open("news_dict.json", "w") as file:
打开文件news_dict.json
进行覆盖。正如文档所说:但你需要设置不同的功能模式
open
,puta
。这种操作模式不会覆盖整个文件,而是追加到其末尾。从文档中:也就是说,在您的代码中:
应替换为:
一切都很简单。