2009-05-04 18 views

Respuesta

14
from BeautifulSoup import BeautifulSoup 

# Parse a small in-memory document, append a query string to every
# anchor's href attribute, then dump the modified tree.
soup = BeautifulSoup(''' 
<html> 
    <head><title>Testing</title></head> 
    <body> 
    <a href="http://foo.com/">foo</a> 
    <a href="http://bar.com/bar">Bar</a> 
    </body> 
</html>''') 

for anchor in soup.findAll('a'):  # every <a> tag in the document
    anchor['href'] += '?foo'

print(soup)

que imprime:

<html> 
<head><title>Testing</title></head> 
<body> 
<a href="http://foo.com/?foo">foo</a> 
<a href="http://bar.com/bar?foo">Bar</a> 
</body> 
</html> 

La documentación también tiene algunos ejemplos para cambiar atributos. Es un extenso tutorial que cubre todos los aspectos comunes de BeautifulSoup. No sé qué falta en la documentación; tal vez deberías aclararlo.

1

mi ejemplo:

# Browser-like request headers sent with every page fetch so the site
# serves its normal HTML instead of rejecting an obvious script client.
HEADERS = {"User-Agent" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5", 
     "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
     "Accept-Language" : "ru,en-us;q=0.7,en;q=0.3", 
     "Accept-Charset" : "windows-1251,utf-8;q=0.7,*;q=0.7", 
     "Accept-Encoding" : "identity, *;q=0", 
     "Connection" : "Keep-Alive"} 
# Optional HTTP proxy as "host:port"; None disables proxying.
PROXY=None 
# Socket timeout in seconds, applied via socket.setdefaulttimeout().
# NOTE(review): PEP 8 would name a module constant TIMEOUT (uppercase).
timeout=60 


def parse_manuf_page_about(page_str_about):
    """Scrape the "About Us" page of an Alibaba manufacturer profile.

    Fetches *page_str_about* (a URL) through urllib2, honouring the
    module-level PROXY and ``timeout`` settings, and extracts company
    details from the page's tables.

    Returns a ``(res, slovar)`` tuple:
        (True, dict)  on success; the dict maps table headers to their
                      values plus the keys "Company_Info",
                      "Contact_Person" (if present) and
                      "Gold member" ("Y"/"N").
        (False, dict) when the page cannot be fetched or its expected
                      structure is missing; the dict may be empty.
    """

    def _node_text(node):
        # Concatenation of all text descendants; fall back to the first
        # raw child for nodes BeautifulSoup reports as empty.
        if len(node) > 0:
            return "".join(node.findAll(text=True)).strip(" \t\n")
        return node.contents[0].strip(" \t\n")

    def _clean_entities(text):
        # Decode the handful of entities this site emits.  The original
        # hand-rolled find/slice loops removed 6 characters for the
        # 5-character "&amp;" and skipped entities at position 0;
        # str.replace handles every occurrence correctly.  "&amp;" goes
        # last so its output "&" cannot feed the other replacements.
        text = text.replace("&nbsp;", "")
        text = text.replace("&quot;", '"')
        return text.replace("&amp;", "&")

    slovar = {}
    socket.setdefaulttimeout(timeout)  # read-only use: no `global` needed
    if PROXY is not None:
        proxy_handler = urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
    page_request = urllib2.Request(url=page_str_about, headers=HEADERS)
    try:
        page = urllib2.urlopen(url=page_request).read()
    except Exception as error:
        # Network/HTTP failure: report and signal failure to the caller.
        print(str(error))
        return False, slovar

    soup = BeautifulSoup(page)

    # "About us" free-text paragraph.
    select_pod = soup.findAll('div', {"class": "win aboutUs"})
    if not select_pod:
        return False, slovar  # page layout changed or empty response
    paragraphs = select_pod[0].findAll("p")
    if not paragraphs:
        return False, slovar
    slovar["Company_Info"] = _node_text(paragraphs[0])

    # Key/value table: each row pairs a <th> header with a <td> value.
    select = soup.findAll('div', {"class": "win"})
    if not select:
        return False, slovar
    for row in select[0].findAll("tr"):
        # Reset per row so a row missing <th>/<td> cannot reuse the
        # previous row's text (the original left these unbound/stale).
        header_text = ""
        value_text = ""
        for th_cell in row.findAll("th"):
            header_text = _node_text(th_cell)
        for td_cell in row.findAll("td"):
            value_text = _node_text(td_cell)
        if header_text:
            slovar[str(header_text)] = _clean_entities(value_text)

    # Bottom contact table: pull the contact person's name.
    for contact_person in soup.findAll('a', {"class": "member-name"}):
        slovar["Contact_Person"] = contact_person.contents[0]

    # Gold-partner status is indicated by a badge in the top-left corner.
    if soup.findAll('a', {"class": "memberLogo"}):
        slovar["Gold member"] = "Y"
    else:
        slovar["Gold member"] = "N"
    return True, slovar

Este código analiza la página "About Us" de un fabricante en Alibaba.com. Puedes ver un ejemplo de la página aquí: http://xmxinhuafeng.en.alibaba.com/aboutus.html

+0

¿Funciona realmente el keep-alive? –