Можно ли спарсить html с этого сайта? http://www.customs.go.jp/toukei/srch/indexe.htm?M=03&P=1,2,,,,1,0,2018,0,5,0,2,271111 ,,,,, 1 ,,,,,,,,,,, 50
На вкладке «Сеть» (Network) в Chrome Dev Tools в разделе "Документы" загружается 7 htm-документов; все они, кроме одного, получены запросами GET. Ответ на единственный запрос POST (этот файл называется JCWSV03) содержит данные в формате html — именно к этим данным я хочу получить доступ. К сожалению, когда я выполняю запрос сам, я получаю не тот html, который отображается на веб-странице.
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests as rq
import urllib.request
# URL of the search page, including its query string.
url = 'http://www.customs.go.jp/toukei/srch/indexe.htm?M=01&P=1,2,,,,,,,,1,0,2017,0,3,0,2,271111000,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,50'
# Plain GET of the page; the context manager closes the HTTP response as soon
# as the body has been read instead of leaving the connection open.
with urllib.request.urlopen(url) as page:
    sauce = page.read().decode('utf-8')
soup = bs(sauce, 'lxml')
# Same URL again, this time as a POST with no form body.
r2 = rq.post(url)
# Notebook-style display of both results (parsed GET page, raw POST body).
soup, r2.text
Это также не работает:
url2 = 'http://www.customs.go.jp/toukei/srch/jccht00p.htm'
# These are HTTP request headers, so they must be sent via `headers=`.
# Passing them as `params=` would append them to the URL's query string.
parameters = {
    'Referer': 'http://www.customs.go.jp/toukei/srch/jccht03e.htm?&P=1,2,,,,,,,,1,0,2018,0,5,0,2,271111,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,50',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
}
# Post to url2 (it was defined for this attempt but never used before).
# NOTE: no form fields (`data=`) are sent with this request.
r3 = rq.post(url2, headers=parameters)
r3.text
Представленный html это:
(<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Copyright (C) Ministry of Finance, The Japanese Government" name="copyright"/>
<meta content="NOINDEX,NOFOLLOW" name="robots"/>
<meta content="text/css" http-equiv="Content-Style-Type"/>
<link href="jcc.css" rel="stylesheet" type="text/css"/>
<title>Trade Statistics ( Search ) :Trade Statistics of Japan Ministry of Finance</title>
</head>
<script language="JavaScript" src="display/jccjs00me.js"></script>
<script language="JavaScript">
<!--
window.onerror=null;
//-->
</script>
<body><noscript>
Unless it turns ON the Javascript function of a browser, search in a site cannot be performed.
</noscript>
<frameset cols="*">
<frame name="FR_M_INFO" src="tope.htm" title="TopPage"/>
</frameset>
</body></html>,
'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\r\n<html lang="en">\r\n\t<head>\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\r\n\t\t<meta name="copyright" content="Copyright (C) Ministry of Finance, The Japanese Government">\r\n\t\t<meta name="robots" content="NOINDEX,NOFOLLOW">\r\n\t\t<meta http-equiv="Content-Style-Type" content="text/css">\r\n\t\t<link href="jcc.css" rel="stylesheet" type="text/css">\r\n\t\t<title>Trade Statistics ( Search ) :Trade Statistics of Japan Ministry of Finance</title>\t\t\r\n\t</head>\r\n\t\r\n\t<SCRIPT LANGUAGE="JavaScript" SRC="display/jccjs00me.js"></SCRIPT>\r\n\t<SCRIPT LANGUAGE="JavaScript">\r\n\t<!--\r\n\t\twindow.onerror=null;\r\n\t//-->\r\n\t</SCRIPT>\r\n\t\r\n\t<noscript>\r\n\t\tUnless it turns ON the Javascript function of a browser, search in a site cannot be performed.\r\n\t</noscript>\r\n\t\r\n\t<FRAMESET COLS="*">\r\n\t\t<FRAME NAME="FR_M_INFO" SRC="tope.htm" title="TopPage">\r\n\t</FRAMESET>\r\n</html>\r\n')
Пожалуйста, подскажите, как это сделать! (Моя конечная цель — разобрать html с помощью bs4, загрузить данные в pandas и повторить это в цикле по периодам времени.)
На странице есть кнопка "CSV-download", которая отправляет этот запрос POST. Воспроизведите запрос с помощью Curl и разберите полученные данные CSV:
POST /JCWSV03/servlet/JCWSV03 HTTP/1.1
Host: www.customs.go.jp
Connection: keep-alive
Content-Length: 1327
Cache-Control: max-age=0
Origin: http://www.customs.go.jp
Upgrade-Insecure-Requests: 1
DNT: 1
Content-Type: application/x-www-form-urlencoded
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Referer: http://www.customs.go.jp/JCWSV03/servlet/JCWSV03
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.9
Cookie: JSESSIONID=UBNQ8NK54PAUN3QUGUN5R3H2IK3QJJ9H7Q8DQ3VJV805T740E70SKKJ4DLI02000A8000000.JCWSV03_001; visid_incap_763612=S8FIHQm2Tgap/mXyryhoy+7RPlsAAAAAQUIPAAAAAACi+fyzQ2Gk1dOZsySNYdbt; incap_ses_208_763612=IwFUalbIKRMxrKSSFPjiAu/RPlsAAAAAZNe3OqD0RhBl1jCtr3682w==
Если вам нужна помощь, чтобы сделать это на Python с помощью Curl, оставьте комментарий, и я соберу пример.
Boo ya !:
import requests
# Cookie name -> value pairs, hard-coded as string literals.
# NOTE(review): these look like session-specific values (JSESSIONID, incap_ses_*)
# copied from a browser session; presumably they expire — confirm before reuse.
# NOTE: the requests.post(...) call further down passes only `data=`, so this
# dict is defined but not sent with the request.
cookies = {
'JSESSIONID': 'V4LKVAGE723K5PGBRA8AVENT943QJJ9H7Q8DQ3RF9AKFUKHMQMVTKJJKD1I020005K000000.JCWSV03_001',
'visid_incap_763612': 'vh9PZJxzSX+IQrGov+h+4xvvBFsAAAAAQUIPAAAAAACfECPoFSsLL9R6RNgb2BNn',
'incap_ses_677_763612': 'S7ABVEJemmrXkVaexDBlCYKiPVsAAAAAZdhOPF1KvRtJcnKmfT3GRA==',
'___utmvmOKuDywS': 'rKJenOqgmrg',
'___utmvbOKuDywS': 'sZe XmcOOalV: ltS',
}
# Browser-like HTTP request headers (header name -> value).
# NOTE(review): presumably copied from the browser's captured request — confirm.
# NOTE: the requests.post(...) call further down passes only `data=`, so this
# dict is defined but not sent with the request.
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Origin': 'http://www.customs.go.jp',
'Upgrade-Insecure-Requests': '1',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Referer': 'http://www.customs.go.jp/toukei/srch/jccht00p.htm',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9,en-CA;q=0.8',
}
# Form fields for the POST, as an ordered list of (name, value) pairs.
# A list of pairs (not a dict) is required because several field names repeat.
# The repeated blank entries are generated with list repetition; the resulting
# order and values are exactly those of the fully written-out list.
data = (
    [
        ('CW_SEARCHID', 'JCCHT03S'),
        ('CW_JAPANKBN', '2'),
        ('CW_IMPKBN', '2'),
        ('CW_CARGOKBN', ''),
        ('CW_SUMKBN', ''),
        ('CW_SPCODE', ''),
        ('CW_SPNAME', ''),
        ('CW_YMSORTKBN', ''),
        ('CW_SISUKBN', ''),
        ('CW_SENKIKBN', ''),
        ('CW_HKKBN', ''),
        ('CW_YMKBN', '1'),
        ('CW_KI', ''),
        ('CW_SYY', '2017'),
        ('CW_EYY', ''),
        ('CW_SMM', '2'),
        ('CW_EMM', ''),
        ('CW_HSKBN', '2'),
        # First of ten CW_HSCODE slots is filled; the other nine are blank.
        ('CW_HSCODE', '271111'),
    ]
    + [('CW_HSCODE', '')] * 9
    + [('CW_HSNAME', '')] * 10
    + [('CW_KUNIKBN', '1')]
    + [('CW_KUNICODE', '')] * 10
    + [('CW_KUNINAME', '')] * 10
    + [('CW_ZMKBN', '')]
    + [('CW_ZMCODE', '')] * 10
    + [('CW_ZMNAME', '')] * 10
    + [('CW_MEISAICNT', '200')]
)
# Submit the search form. Only the form fields are sent: the `cookies` and
# `headers` dicts defined earlier are not passed to this call.
# The timeout keeps the script from blocking indefinitely on an unresponsive server.
response = requests.post(
    'http://www.customs.go.jp/JCWSV03/servlet/JCWSV03',
    data=data,
    timeout=30,
)
# Fail loudly on an HTTP error status instead of handing an error page to the parser.
response.raise_for_status()
from bs4 import BeautifulSoup as bs
# Parse the HTML returned by the POST.
soup = bs(response.text, 'html.parser')
# <td class="left_sTotal"> cells: every third one (starting at the first) is
# used as the country label.
# NOTE(review): the 3- and 6-cell strides presumably mirror the page's row
# layout — verify against the live HTML.
country = [cell.text for cell in soup.find_all('td', 'left_sTotal')[0::3]]
# Search for the <td class="sTotal"> cells once and slice the same result
# list twice, rather than scanning the whole document for each column.
s_total_cells = soup.find_all('td', 'sTotal')
quantity = [cell.text for cell in s_total_cells[1::6]]
value = [cell.text for cell in s_total_cells[2::6]]
# One [country, quantity, value] row per entry; zip stops at the shortest column.
total_list = [list(row) for row in zip(country, quantity, value)]
import pandas as pd
# Build a three-column table from the scraped rows; cell values stay as the
# text extracted from the page (no numeric conversion is done here).
df = pd.DataFrame(
    data=total_list,
    columns=('country', 'quantity', 'value'),
)
# Preview the first rows (notebook-style display).
df.head()