It reads "Table 1" from an Excel file (需要保留的关键词.xlsx) into a DataFrame called keywords_df.
It reads "Table 2" from a CSV file (原始数据.csv) into another DataFrame called data_df.
创建一个名为的空 DataFrame,result_df其列与相同data_df。
import pandas as pd
from tqdm import tqdm
# Read Table 1
keywords_df = pd.read_excel(r"C:\Users\Desktop\需要保留的关键词.xlsx")
# Read Table 2 (数据表格)
data_df = pd.read_csv(r"C:\Users\Desktop\原始数据.csv", dtype=str)
# Create an empty Table 3
result_df = pd.DataFrame(columns=data_df.columns)
# Iterate over the keywords in Table 1
for keyword in tqdm(keywords_df['关键词'], desc="Processing"):
# Find rows in Table 2 where the "地址" column matches the keyword
matched_rows = data_df[data_df['地址'].str.contains(keyword, na=False)]
# Append the matched rows to Table 3
result_df = pd.concat([result_df, matched_rows], ignore_index=True)
# Remove duplicate rows from Table 3 based on all columns
result_df = result_df.drop_duplicates()
# Save Table 3 to a CSV file
result_df.to_csv(r"C:\Users\Desktop\筛选出包含关键词的行.csv", index=False)
# Print "Query Complete"
print("Query Complete")
第四节 运行示例
原始数据如下:
需要保留的关键词假设如下:
代码运行完毕后(只保留了包含太原市和阳泉市的行):
import pandas as pd
from tqdm import tqdm
# Read Table 1
keywords_df = pd.read_excel(r"C:\Users\Desktop\需要保留的关键词.xlsx")
# Read Table 2 (数据表格)
data_df = pd.read_csv(r"C:\Users\Desktop\原始数据.csv", dtype=str)
# Create an empty Table 3
result_df = pd.DataFrame(columns=data_df.columns)
# Iterate over the keywords in Table 1
for keyword in tqdm(keywords_df['关键词'], desc="Processing"):
# Find rows in Table 2 where the "地址" column matches the keyword
matched_rows = data_df[data_df['地址'].str.contains(keyword, na=False)]
# Append the matched rows to Table 3
result_df = pd.concat([result_df, matched_rows], ignore_index=True)
# Remove duplicate rows from Table 3 based on all columns
result_df = result_df.drop_duplicates()
# Save Table 3 to a CSV file
result_df.to_csv(r"C:\Users\Desktop\筛选出包含关键词的行.csv", index=False)
# Print "Query Complete"
print("Query Complete")
import pandas as pd
from tqdm import tqdm
# Read Table 1
keywords_df = pd.read_excel(r"C:\Users\Desktop\需要删除的关键词.xlsx")
# Read Table 2
data_df = pd.read_csv(r"C:\Users\Desktop\原始数据.csv", dtype=str)
# Iterate over the keywords in Table 1
for keyword in tqdm(keywords_df['删除的关键词'], desc="Processing"):
# Find rows in Table 2 where the "地址" column contains the keyword as a substring
matched_rows = data_df[data_df['地址'].str.contains(keyword, na=False, regex=False)]
# Remove the matched rows from Table 2
data_df = data_df[~data_df['地址'].str.contains(keyword, na=False, regex=False)]
# Save the remaining data to a CSV file
data_df.to_csv(r"C:\Users\Desktop\去除掉包含关键词的行.csv", index=False)
# Print "Query Complete"
print("Query Complete")