이제 필터링을 마친 제품리스트 JSON을 가져와 DB에 저장을 해야 하기에 먼저 각 제품들의 테이블을 초기화하고 애초에 크롤링해 오는 제품들은 인기도순으로 가져오는 것이기 때문에 AUTO_INCREMENT를 사용해 각 제품들의 인기순위를 나타냈다.
with open('HARDWARE_DATA_new/Case_List.json', 'r', encoding='utf-8') as f:
case_data = json.load(f)
with open('HARDWARE_DATA_new/Cooler_List.json', 'r', encoding='utf-8') as f:
cooler_data = json.load(f)
with open('HARDWARE_DATA_new/CPU_List.json', 'r', encoding='utf-8') as f:
cpu_data = json.load(f)
with open('HARDWARE_DATA_new/HDD_List.json', 'r', encoding='utf-8') as f:
hdd_data = json.load(f)
with open('HARDWARE_DATA_new/MBoard_List.json', 'r', encoding='utf-8') as f:
mboard_data = json.load(f)
with open('HARDWARE_DATA_new/power_List.json', 'r', encoding='utf-8') as f:
power_data = json.load(f)
with open('HARDWARE_DATA_new/RAM_List.json', 'r', encoding='utf-8') as f:
ram_data = json.load(f)
with open('HARDWARE_DATA_new/SSD_List.json', 'r', encoding='utf-8') as f:
ssd_data = json.load(f)
with open('HARDWARE_DATA_new/VGA_List.json', 'r', encoding='utf-8') as f:
vga_data = json.load(f)
delete_query1 = "DELETE FROM pc_case"
delete_query2 = "DELETE FROM pc_cooler"
delete_query3 = "DELETE FROM pc_cpu"
delete_query4 = "DELETE FROM pc_hdd"
delete_query5 = "DELETE FROM pc_mboard"
delete_query6 = "DELETE FROM pc_power"
delete_query7 = "DELETE FROM pc_ram"
delete_query8 = "DELETE FROM pc_ssd"
delete_query9 = "DELETE FROM pc_vga"
delete_query10 = "DELETE FROM pc_default"
cursor.execute(delete_query1)
cursor.execute(delete_query2)
cursor.execute(delete_query3)
cursor.execute(delete_query4)
cursor.execute(delete_query5)
cursor.execute(delete_query6)
cursor.execute(delete_query7)
cursor.execute(delete_query8)
cursor.execute(delete_query9)
cursor.execute(delete_query10)
alter_query = "ALTER TABLE pc_case AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_cooler AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_cpu AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_hdd AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_mboard AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_power AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_ram AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_ssd AUTO_INCREMENT = 1"
cursor.execute(alter_query)
alter_query = "ALTER TABLE pc_vga AUTO_INCREMENT = 1"
cursor.execute(alter_query)
insert_default_cooler = "INSERT INTO pc_cooler (product_num, manufacturer_name, product_name, product_salePrice, product_originalPrice, Socket_Type, Color, product_description, product_IMG) VALUES (0, '기본쿨러', '기본쿨러', '0', '0', '기본', NULL, NULL, NULL)"
cursor.execute(insert_default_cooler)
그 후 각 제품별로 테이터를 집어넣는 작업이 필요했다.
for pc_case in case_data:
manufacturer_name = pc_case.get('brand')
product_name = pc_case.get('name')
product_price = pc_case.get('price')
if product_price == '일시품절':
product_originalPrice = 0
product_salePrice = 0 # '일시품절'인 경우 0으로 할당
elif product_price == '가격비교예정':
product_originalPrice = 0
product_salePrice = 0 # '가격비교예정'인 경우 0으로 할당
elif product_price == '가격비교중지':
product_originalPrice = 0
product_salePrice = 0 # '가격비교예정'인 경우 0으로 할당
else:
product_originalPrice = int(product_price)
product_salePrice = int(product_price) * 0.95
product_description = ';'.join(pc_case.get('spec'))
Size = pc_case.get('spec')[0]
Color = pc_case.get('color_text')
matches = re.search(r'\(([^)]+)\)', Size)
if matches:
C_Size = matches.group(1)
else:
C_Size = Size
match_gpu1 = re.search(r'GPU 장착: (?:최대 )?(\d+)mm', product_description)
match_gpu2 = re.search(r'GPU 장착: (\d+)~(\d+)mm', product_description)
if match_gpu1:
gpu_size = match_gpu1.group(1)
elif match_gpu2:
gpu_size = match_gpu2.group(2)
else:
gpu_size = 0
#start_index = Case_Size.find("(") + 1
#end_index = Case_Size.find(")")
#Size = Case_Size[start_index:end_index]
product_img = pc_case.get('img')
if gpu_size != 0:
insert_query = "INSERT INTO pc_case(manufacturer_name, product_name, product_salePrice, product_originalPrice, Board_Size, GPU_Size, Color, product_description, product_IMG) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
insert_value = (manufacturer_name, product_name, product_salePrice, product_originalPrice, C_Size, gpu_size, Color, product_description, product_img)
cursor.execute(insert_query, insert_value)
for pc_cpu in cpu_data:
manufacturer_name = pc_cpu.get('brand')
product_name = pc_cpu.get('name')
product_price = pc_cpu.get('price')
if product_price == '일시품절':
product_originalPrice = 0
product_salePrice = 0 # '일시품절'인 경우 0으로 할당
elif product_price == '가격비교예정':
product_originalPrice = 0
product_salePrice = 0 # '가격비교예정'인 경우 0으로 할당
elif product_price == '가격비교중지':
product_originalPrice = 0
product_salePrice = 0 # '가격비교예정'인 경우 0으로 할당
else:
product_originalPrice = int(product_price)
product_salePrice = int(product_price) * 0.95
product_description = ';'.join(pc_cpu.get('spec'))
tdp_one = r'TDP: (\d+)W'
tdp_wave = r'TDP: (\d+)~(\d+)W'
PBP_one = r'PBP/MTP: (\d+)W'
PBP_wave = r'PBP/MTP: (\d+)~(\d+)W'
matches1 = re.search(tdp_one, product_description)
if matches1:
tdp_value = matches1.group(1)
else:
matches2 = re.search(tdp_wave, product_description)
if matches2:
tdp_value = matches2.group(2)
else:
matches3 = re.search(PBP_one, product_description)
if matches3:
tdp_value = matches3.group(1)
else:
matches4 = re.search(PBP_wave, product_description)
if matches4:
tdp_value = matches4.group(2)
else:
tdp_value = 0
socket_info = pc_cpu.get('spec')[0]
WhatSocket = list(socket_info)
formatted_socket_info = ""
matches = re.search(r'내장그래픽: (\S+);', product_description)
if matches:
integrated_graphics = matches.group(1)
matches_mem = re.search(r'메모리 규격: ((?:DDR[0-9]+,? ?)+)', product_description)
if matches_mem:
memory_type = matches_mem.group(1).strip()
if WhatSocket[0] == "인":
matches1 = re.search(r'인텔\(소켓(\d+)\)', socket_info)
if matches1:
socket_number = matches1.group(1)
formatted_socket_info = f"LGA{socket_number}"
elif WhatSocket[0] == "A":
matches2 = re.search(r'AMD\(소켓([A-Za-z0-9]+)\)', socket_info)
if matches2:
socket_number = matches2.group(1)
formatted_socket_info = f"{socket_number}"
product_img = pc_cpu.get('img')
def_cooler = re.search(r'쿨러: (.+?);', product_description)
if def_cooler:
cooler_stat = def_cooler.group(1)
insert_query = "INSERT INTO pc_cpu(manufacturer_name, product_name, product_salePrice, product_originalPrice, InterGrated_graphics, TDP, Socket_Type, Memory_Type, Stock_Cooler, product_description, product_IMG) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
insert_value = (manufacturer_name, product_name, product_salePrice, product_originalPrice, integrated_graphics, tdp_value, formatted_socket_info, memory_type, cooler_stat, product_description, product_img)
cursor.execute(insert_query, insert_value)
......
위와같은 방법으로 모든 카테고리 내에 데이터를 집어넣었다.
처음에는 위와 같은 방법으로 데이터를 집어넣고 코드상에서 데이터를 가져와 pc조합을 맞추려 했지만 조합에 대한 값이 너무 방대해져 값이 추출되는데 오래 걸리고 또한 한번 한번 테스트를 하는데 너무 오랜 시간이 걸리는 문제점을 발견했다.
그래서 pc_default 테이블을 새로 만들어 데이터를 정리해서 입력하는 쿼리문을 추가했다.
made_default_query = """INSERT INTO pc_default(
cpu_product_num, cpu_product_name, cpu_product_originalPrice,
cpu_InterGrated_graphics, cpu_TDP, cpu_Stock_Cooler, cpu_Socket_Type,
mboard_product_num, mboard_product_name, mboard_product_originalPrice,
mboard_MBoard_Size, ram_product_num, ram_product_name, ram_product_originalPrice, ram_R_Size,
cooler_product_num, cooler_product_name, cooler_product_originalPrice, cooler_Color
)
SELECT
cpu.product_num, cpu.product_name, cpu.product_originalPrice,
cpu.InterGrated_graphics, cpu.TDP, cpu.Stock_Cooler, cpu.Socket_Type,
mboard.product_num, mboard.product_name, mboard.product_originalPrice,
mboard.MBoard_Size, ram.product_num, ram.product_name, ram.product_originalPrice, ram.R_Size,
cooler.product_num, cooler.product_name, cooler.product_originalPrice, cooler.Color
FROM (
SELECT *
FROM pc_ram
WHERE pc_ram.R_Size <= 32
ORDER BY product_num ASC
) AS ram
JOIN pc_cpu AS cpu ON FIND_IN_SET(ram.Version, cpu.Memory_Type) > 0
JOIN (
SELECT *
FROM pc_mboard
ORDER BY product_num ASC
LIMIT 500
) AS mboard ON cpu.Socket_Type = mboard.Socket
JOIN (
SELECT *
FROM pc_cooler
ORDER BY product_num ASC
LIMIT 10
) AS cooler ON FIND_IN_SET(cpu.Socket_Type, cooler.Socket_Type) > 0
WHERE ABS(CAST(REGEXP_REPLACE(ram.MHz, '[^0-9]', '') AS SIGNED) - mboard.MHz) <= 500;"""
cursor.execute(made_default_query)
위처럼 쿼리문을 추가해 VGA가 필요한 부분이든 아니든 cpu, 메인보드, ram은 무조건 들어가는 부분이기때문에 default에 넣고 프로그램을 돌릴 수 있는 기반을 마련했다.
'실력 향상 일지 > 23년 캡스톤 디자인' 카테고리의 다른 글
3. Python) 데이터 필터링 (0) | 2023.12.09 |
---|---|
2. Python) 크롤링 과정 (1) | 2023.12.09 |
1. 크롤링 준비 (0) | 2023.08.08 |
0. 시작하며 (0) | 2023.07.01 |