来自:
https://www.jb51.net/article/72211.htm
1、推荐的一种方法:php判断搜索引擎蜘蛛爬虫还是人为访问代码,摘自Discuz x3.2
<?php
/**
 * Decide whether a request was made by a search-engine crawler.
 *
 * The user agent is lowercased and then classified in two steps:
 *   1. If it does not contain "http://" and contains a known browser keyword,
 *      it is treated as a regular browser.
 *   2. Otherwise it is a crawler when it contains any known spider keyword.
 *
 * Note: only "http://" is looked for in step 1, so an agent that advertises an
 * "https://" URL and also contains a browser keyword is reported as a browser.
 *
 * @param string $useragent User agent to classify. When empty, the
 *                          HTTP_USER_AGENT request header is used instead.
 * @return bool True for a crawler, false for a browser or an unknown/missing agent.
 */
function checkrobot($useragent = '')
{
    static $kw_spiders = array('bot', 'crawl', 'spider', 'slurp', 'sohu-search', 'lycos', 'robozilla');
    static $kw_browsers = array('msie', 'netscape', 'opera', 'konqueror', 'mozilla');

    if (empty($useragent)) {
        // The header is optional (CLI runs, clients that omit it), so do not
        // read the key unless it exists; an absent agent is classified as "not a robot".
        $useragent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    }
    $useragent = strtolower($useragent);

    // Crawlers usually embed a contact URL, so a browser keyword only counts
    // when no "http://" appears in the agent.
    if (strpos($useragent, 'http://') === false && dstrpos($useragent, $kw_browsers)) {
        return false;
    }

    if (dstrpos($useragent, $kw_spiders)) {
        return true;
    }

    return false;
}
/**
 * Check whether a string contains at least one of several substrings.
 *
 * @param string       $string      Haystack to search; an empty() value yields false.
 * @param array|string $arr         Needle or list of needles (a scalar is cast to a one-element array).
 * @param bool         $returnvalue When truthy, return the first matching needle instead of true.
 * @return bool|string False when nothing matches; otherwise true, or the matching
 *                     needle when $returnvalue is truthy.
 */
function dstrpos($string, $arr, $returnvalue = false)
{
    if (empty($string)) {
        return false;
    }

    $needles = (array) $arr;
    foreach ($needles as $needle) {
        if (strpos($string, $needle) === false) {
            // This needle is absent; try the next one.
            continue;
        }
        if ($returnvalue) {
            return $needle;
        }
        return true;
    }

    return false;
}
// Print which kind of visitor checkrobot() detected for the current request.
echo checkrobot() ? '机器人爬虫' : '人';
?>
实际应用中可以这样判断:只有当访问者不是搜索引擎蜘蛛时才执行操作
<?php
// Run the guarded logic only for visitors that checkrobot() does not flag
// as a crawler (checkrobot() returns a boolean).
if (checkrobot() === false) {
    // do something
}
?>
2、第二种方法:
使用PHP实现蜘蛛访问日志统计
// Spider access log: identify the crawler from the User-Agent header and
// append one tab-separated line per crawler hit to bot.txt.

// The header is optional, so fall back to an empty string rather than
// reading a key that may not exist.
$useragent = addslashes(strtolower(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''));

// Signature => label, tested in order; the first hit wins.
// - Every signature must be lowercase because the agent is lowercased above.
// - More specific signatures come before the generic ones they contain
//   ('msnbot' before 'msn'; everything before the catch-all 'bot').
$botSignatures = array(
    'googlebot'            => 'Google',
    'mediapartners-google' => 'Google Adsense',
    'baiduspider'          => 'Baidu',
    'sogou spider'         => 'Sogou',
    'sogou web'            => 'Sogou web',
    'sosospider'           => 'SOSO',
    '360spider'            => '360Spider',
    'yahoo'                => 'Yahoo',
    'msnbot'               => 'msnbot',
    'msn'                  => 'MSN',
    'sohu'                 => 'Sohu',
    'yodaobot'             => 'Yodao',
    'twiceler'             => 'Twiceler',
    'ia_archiver'          => 'Alexa_',
    'iaarchiver'           => 'Alexa',
    'slurp'                => '雅虎',
    'bot'                  => '其它蜘蛛',
);

foreach ($botSignatures as $signature => $label) {
    if (strpos($useragent, $signature) !== false) {
        $bot = $label;
        break;
    }
}

if (isset($bot)) {
    // Line format: timestamp, client IP, crawler label, requested URL.
    $line = date('Y-m-d H:i:s') . "\t" . $_SERVER["REMOTE_ADDR"] . "\t" . $bot . "\t"
        . 'http://' . $_SERVER['SERVER_NAME'] . $_SERVER["REQUEST_URI"] . "\r\n";

    // Logging is best-effort: when the file cannot be opened, skip the write
    // instead of passing false to fwrite()/fclose().
    $fp = @fopen('bot.txt', 'a');
    if ($fp !== false) {
        fwrite($fp, $line);
        fclose($fp);
    }
}
3、第三种方法:
我们可以通过HTTP_USER_AGENT来判断是否是蜘蛛,搜索引擎的蜘蛛都有自己的独特标志,下面列举了一部分。
/**
 * Tell whether the current request's User-Agent contains a known crawler keyword.
 *
 * Matching is case-insensitive: both the agent and each keyword are lowercased
 * before the substring test.
 *
 * @return bool True when a keyword is found; false otherwise, including when
 *              the request carries no User-Agent header.
 */
function is_crawler()
{
    // The header is optional; without it there is nothing to match.
    if (!isset($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $userAgent = strtolower($_SERVER['HTTP_USER_AGENT']);

    $spiders = array(
        'Googlebot',    // Google crawler
        'Baiduspider',  // Baidu crawler
        'Yahoo! Slurp', // Yahoo crawler
        'YodaoBot',     // Youdao crawler
        'msnbot'        // Bing crawler
        // add more crawler keywords here
    );

    foreach ($spiders as $spider) {
        if (strpos($userAgent, strtolower($spider)) !== false) {
            return true;
        }
    }

    return false;
}
下面的php代码附带了更多的蜘蛛标识
/**
 * Tell whether the current request's User-Agent contains one of a longer list
 * of crawler signatures.
 *
 * Matching is case-insensitive: the agent and each signature are lowercased
 * before the substring test. The function produces no output.
 *
 * @return bool True when a signature is found; false when none matches or the
 *              User-Agent header is missing or empty.
 */
function isCrawler()
{
    // The header is optional; treat a missing one like an empty agent.
    $agent = strtolower(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '');
    if (empty($agent)) {
        return false;
    }

    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
    );

    foreach ($spiderSite as $val) {
        if (strpos($agent, strtolower($val)) !== false) {
            return true;
        }
    }

    // Explicit false so callers always receive a boolean.
    return false;
}
// Greet the visitor according to what isCrawler() reports for this request.
echo isCrawler() ? "你好蜘蛛精!" : "你不是蜘蛛精啊!";
4、第四种方法:
<?php
// Redirect every visitor that is not a recognised crawler to the same path on
// //www.jb51.net; recognised crawlers continue with the rest of the page.
$flag = false;

// The header is optional; an absent agent is treated as "not a crawler".
$tmp = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

// Case-sensitive substrings that mark a crawler User-Agent.
$crawlerSignatures = array(
    'Googlebot',
    'Baiduspider',
    'Yahoo! Slurp',
    'msnbot',
    'Sosospider',
    'YodaoBot',
    'OutfoxBot',
    'Sogou web spider',
    'Sogou Orion spider',
    'fast-webcrawler',
    'Gaisbot',
    'ia_archiver',
    'altavista',
    'lycos_spider',
    'Inktomi slurp',
);

foreach ($crawlerSignatures as $signature) {
    // Compare against false, not "> 0": strpos() returns 0 when the agent
    // begins with the signature, and that is still a match.
    if (strpos($tmp, $signature) !== false) {
        $flag = true;
        break;
    }
}

if (!$flag) {
    header("Location: //www.jb51.net" . $_SERVER['REQUEST_URI']);
    // Redirects to the corresponding page on //www.jb51.net.
    // $_SERVER['REQUEST_URI'] is the path that follows the domain name.
    // Alternatively use a fixed target: header("Location: //www.jb51.net/abc/d.php");
    exit();
}
?>