# 简单的 Python 脚本,可以筛选出网页内的链接,并且去除重复项,但是只能扫描一个页面,不能递归查询。
# coding:utf-8
import re
import requests
# Fetch a single web page and list the distinct links found in it.

# Matches href="..." or href='...'. The opening quote is captured and
# back-referenced so the value ends at the *matching* quote, and `.*?` lets an
# empty href="" match as an empty value instead of swallowing the markup that
# follows it.
_HREF_PATTERN = re.compile(r"""href=(["'])(.*?)\1""")


def extract_unique_links(html):
    """Return the distinct href values found in *html*, in first-seen order.

    Args:
        html: Text of the page to scan.

    Returns:
        A list of non-empty href attribute values. Each value appears once,
        at the position of its first occurrence in *html*.
    """
    seen = set()
    links = []
    for _quote, link in _HREF_PATTERN.findall(html):
        # Empty hrefs carry no link; duplicates were already reported.
        if link and link not in seen:
            seen.add(link)
            links.append(link)
    return links


def main():
    """Prompt for a URL, fetch it, and print each distinct link with its index.

    Only the given page is scanned; links are not followed recursively.
    If the request fails, a "bad link" message is printed instead.
    """
    # raw_input exists only on Python 2; on Python 3 the equivalent is input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    url = read_line("Please input the target test url:")

    # Only the network call is guarded, so unrelated bugs and Ctrl-C are not
    # misreported as a bad link.
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException:
        print("错误的链接!!!")
        return

    for index, link in enumerate(extract_unique_links(response.text)):
        # Single-argument print keeps the output identical on Python 2 and 3:
        # "<index> <link> \n" followed by print's own newline.
        print("%d %s \n" % (index, link))


if __name__ == "__main__":
    main()