做网站最重要的环节之一肯定是收录:页面没有被收录,其他都是空谈,更不会有搜索流量。由于每个行业的网站众多,如何让搜索引擎第一时间发现你的网站页面并且收录呢?百度的主动推送操作不能丢。每当你在百度站长平台提交网站后,百度也会提示你去主动推送URL,让蜘蛛第一时间去抓取收录;收录上去了,SEO才有希望。对于海洋CMS这类专门做影视的程序,百度自动推送URL的功能并不完善,所以写一下教程来实现。
1、在根目录新建一个map目录;
2、在map目录里新建一个index.php,文件代码内容如下:
<?php
/**
 * map/index.php - entry point that renders the XML sitemap for one category.
 *
 * Flow: load the CMS bootstrap, optionally redirect mobile visitors to the
 * mobile host, resolve the category id ($tid) and page number ($page) from
 * the request, then hand over to echoChannel().
 */
require_once(dirname(__FILE__)."/../include/common.php");

// Pre-redirect for mobile visitors: with cfg_mskin 3 the request URI is kept,
// with cfg_mskin 4 the visitor is sent to the mobile host root.
// exit after header() so nothing else is rendered behind the redirect.
$cs = $_SERVER["REQUEST_URI"];
if ($GLOBALS['cfg_mskin'] == 3 AND $GLOBALS['isMobile'] == 1) {
    header("location:$cfg_mhost$cs");
    exit;
}
if ($GLOBALS['cfg_mskin'] == 4 AND $GLOBALS['isMobile'] == 1) {
    header("location:$cfg_mhost");
    exit;
}

require_once(sea_INC."/main.class.php");
header('Content-Type:text/xml;charset=UTF-8');

if ($GLOBALS['cfg_runmode'] == 2 || $GLOBALS['cfg_paramset'] == 0) {
    // Parameters come from the raw query string, either "<tid>-<page>" or
    // just "<tid>", with the configured file suffix stripped first.
    $paras = str_replace(getfileSuffix(), '', $_SERVER['QUERY_STRING']);
    if (strpos($paras, "-") > 0) {
        $parasArray = explode("-", $paras);
        $tid  = $parasArray[0];
        $page = $parasArray[1];
    } else {
        $tid  = intval($paras);
        $page = 1;
    }
} else {
    // Parameters live in variables whose NAMES are configured in
    // cfg_paramid / cfg_parampage.  The braces are required: on PHP 7+
    // `$$GLOBALS['x']` is parsed as `($$GLOBALS)['x']`, not `${$GLOBALS['x']}`.
    $tid  = isset(${$GLOBALS['cfg_paramid']})   ? ${$GLOBALS['cfg_paramid']}   : 0;
    $page = isset(${$GLOBALS['cfg_parampage']}) ? ${$GLOBALS['cfg_parampage']} : 1;
}

// Anything non-numeric falls back to category 0 / page 1.
$tid  = (isset($tid)  && is_numeric($tid))  ? intval($tid)  : 0;
$page = (isset($page) && is_numeric($page)) ? intval($page) : 1;

// A tid of 0 is deliberately allowed through (no "missing parameter" error),
// so the sitemap can be requested without a category id.
$GLOBALS['tid'] = $tid;   // quoted key: a bare `tid` constant is an Error on PHP 8
echoChannel($tid);
/**
 * Render the sitemap template for one category and echo the result.
 *
 * Counts the rows of sea_data that belong to the category (including rows
 * that list it in v_extratype), renders /map/channel.html (optionally via the
 * file cache), fills in the page list and the {channelpage:*} placeholders,
 * and prints the final markup.
 *
 * @param int|string $typeId Category id; the caller passes an intval()'d value.
 * @return void
 */
function echoChannel($typeId)
{
    global $dsql, $cfg_iscache, $mainClassObj, $page, $t1, $cfg_basehost;

    // The category's own template name is only used to look up the page size;
    // the markup itself always comes from the map template.
    $channelTmpName = getTypeTemplate($typeId);
    $channelTmpName = empty($channelTmpName) ? "channel.html" : $channelTmpName;
    $channelTemplatePath = "/map/channel.html";

    $pSize = getPageSizeOnCache($channelTemplatePath, "channel", $channelTmpName);
    if (empty($pSize)) {
        $pSize = 12;
    }

    $typeIds  = getTypeId($typeId);
    $typename = getTypeName($typeId);

    // Also count rows that carry this category as an extra type.  The id is
    // cast to int before it is placed into the SQL text.
    $extrasql = ($typeId != "")
        ? " or FIND_IN_SET('".intval($typeId)."',v_extratype)<>0 "
        : "";
    $sql = "select count(*) as dd from sea_data where (tid in (".$typeIds.") ".$extrasql.")";
    $row = $dsql->GetOne($sql);
    $TotalResult = is_array($row) ? $row['dd'] : 0;
    $pCount = ceil($TotalResult / $pSize);

    // NOTE(review): this cache key has no "map" component.  If the regular
    // channel page uses the same "parse_channel_" key, the two would share
    // cache entries - confirm against the site's list page before enabling
    // cfg_iscache for this script.
    $cacheName = "parse_channel_".$typeId.$GLOBALS['cfg_mskin'].$GLOBALS['isMobile'];
    if ($cfg_iscache && chkFileCache($cacheName)) {
        $content = getFileCache($cacheName);
    } else {
        $content = parseChannelPart($channelTemplatePath, $typeId);
        $content = str_replace("{channelpage:typename}", $typename, $content);
        $content = str_replace("{channelpage:typeid}", $typeId, $content);
        if ($cfg_iscache) {
            setFileCache($cacheName, $content);
        }
    }

    // Page-dependent parts are filled in after the cache lookup.
    $content = str_replace("{channelpage:page}", $page, $content);
    $content = $mainClassObj->ParsePageList($content, $typeIds, $page, $pCount, $TotalResult, "channel", $typeId);
    $content = $mainClassObj->parseIf($content);
    $content = str_replace("{seacms:member}", front_member(), $content);

    // One search link per sort order, replaced in the original order.
    $orders = array(
        'hit', 'hitasc', 'id', 'idasc', 'time', 'timeasc',
        'commend', 'commendasc', 'score', 'scoreasc',
    );
    foreach ($orders as $order) {
        $content = str_replace(
            "{channelpage:order-".$order."-link}",
            $cfg_basehost."/search.php?page=1&searchtype=5&order=".$order."&tid=".$typeId,
            $content
        );
    }

    echo str_replace("{seacms:runinfo}", getRunTime($t1), $content);
}
/**
 * Load a template file and run it through the category-independent and
 * category-specific template parsers.
 *
 * @param string     $templatePath  Template path relative to sea_ROOT.
 * @param int|string $currentTypeId Category id substituted into the template.
 * @return string The parsed template markup.
 */
function parseChannelPart($templatePath,$currentTypeId)
{
    global $mainClassObj;

    $html = loadFile(sea_ROOT.$templatePath);
    $html = $mainClassObj->parseTopAndFoot($html);
    $html = str_replace("{seacms:currenttypeid}", $currentTypeId, $html);

    // Parsers that only need the markup, applied in this order.
    foreach (array('parseSelf', 'parseHistory', 'parseGlobal') as $step) {
        $html = $mainClassObj->$step($html);
    }

    // List parsers; some of them also take the current category id.
    $html = $mainClassObj->parseMenuList($html, "", $currentTypeId);
    $html = $mainClassObj->parseAreaList($html);
    $html = $mainClassObj->parseVideoList($html, $currentTypeId);
    $html = $mainClassObj->parseNewsList($html, $currentTypeId);
    $html = $mainClassObj->parseTopicList($html);

    // Category metadata placeholders, each filled by its own getter.
    $typeTags = array(
        "{channelpage:typetext}"    => 'getTypeText',
        "{channelpage:keywords}"    => 'getTypeKeywords',
        "{channelpage:description}" => 'getTypeDescription',
        "{channelpage:title}"       => 'getTypeTitle',
    );
    foreach ($typeTags as $tag => $getter) {
        $html = str_replace($tag, $getter($currentTypeId), $html);
    }

    return $html;
}
- ?>
3、在map目录下新建一个channel.html文件,代码内容如下:
<?xml version="1.0" encoding="utf-8"?>
<!-- Sitemap template rendered by map/index.php: the channellist loop below emits one url entry per listed item. -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{seacms:channellist size=2000 order=time}
  <url>
    <loc>{seacms:siteurl}[channellist:link]</loc>
    <lastmod>[channellist:time style=yyyy-mm-dd]</lastmod>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
{/seacms:channellist}
</urlset>
4、用Python脚本从xml文件取数据并做百度主动推送(脚本里的sitemap地址、site和token都是示例,需要换成自己站点的),代码如下:
- #coding:utf-8
- import requests,time,re,os
- import sys
# Python 2 only: make implicit str/unicode conversions use UTF-8.
# Python 3 has no builtin reload() and already defaults to UTF-8, so the
# hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)  # restores sys.setdefaultencoding, which site.py removes
    sys.setdefaultencoding('utf-8')
def main():
    """Collect URLs that are new in the sitemap and push them to Baidu.

    Steps:
      1. Download the sitemap XML and extract every ``<loc>`` URL.
      2. Drop URLs already recorded in ``all_url.txt`` as well as duplicates
         inside the sitemap itself.
      3. Append the new URLs to ``all_url.txt`` and write them, 2000 per
         file, to ``yesterday_<n>.txt`` (desktop) and ``yesterday_m_<n>.txt``
         (mobile host).
      4. POST every non-empty batch file to the push endpoint, once for the
         desktop site and once for the mobile site.

    A network or file error while pushing one batch is printed and that
    batch is skipped; the remaining batches are still attempted.

    NOTE(review): URLs are recorded in ``all_url.txt`` before the push, so a
    batch whose push fails is not retried on the next run.
    """
    import io  # io.open gives the same UTF-8 text handling on Python 2 and 3

    sitemap_url = 'http://yp.jd.com/00/00_0.xml'
    push_url = 'http://data.zz.baidu.com/urls'
    batch_size = 2000
    timeout = 30  # seconds; without it a stalled connection blocks forever

    # URLs recorded by earlier runs.  A set keeps the membership test O(1);
    # on the very first run the history file does not exist yet.
    pushed = set()
    if os.path.exists('all_url.txt'):
        with io.open('all_url.txt', encoding='utf-8') as history:
            pushed = set(line.strip() for line in history)

    response = requests.get(sitemap_url, timeout=timeout)
    # Decode explicitly so the regex and the set lookups work on text in
    # both Python 2 and 3.
    sitemap_xml = response.content.decode('utf-8')
    links = re.findall(r'<loc>(.*?)</loc>', sitemap_xml)

    new_links = []
    for link in links:
        if link in pushed:
            continue
        pushed.add(link)  # also filters repeats inside this sitemap
        new_links.append(link)

    with io.open('all_url.txt', 'a', encoding='utf-8') as all_urls:
        for link in new_links:
            all_urls.write(link + u'\n')

    # Always (re)write batch 0, even when empty, so a stale file from a
    # previous run can never be pushed again.
    batches = [new_links[start:start + batch_size]
               for start in range(0, len(new_links), batch_size)] or [[]]
    for index, batch in enumerate(batches):
        with io.open('yesterday_%s.txt' % index, 'w', encoding='utf-8') as pc_file:
            for link in batch:
                pc_file.write(link + u'\n')
        with io.open('yesterday_m_%s.txt' % index, 'w', encoding='utf-8') as m_file:
            for link in batch:
                # Rewrite only the host prefix, not every "www" in the URL.
                m_file.write(link.replace(u'://www.', u'://m.', 1) + u'\n')

    print('yesterday has %s' % len(new_links))
    print('crawl done')
    time.sleep(5)

    # Start pushing.
    print('push begin')
    headers = {'Content-Type': 'text/plain'}
    params = {'site': 'www.jd.com', 'token': '00'}  # ,'type':'original'
    params_m = {'site': 'm.jd.com', 'token': '00'}  # ,'type':'original'
    for index, batch in enumerate(batches):
        if not batch:
            continue  # nothing new: do not POST an empty body
        try:
            with open('yesterday_%s.txt' % index, 'rb') as pc_file:
                pc_payload = pc_file.read()
            with open('yesterday_m_%s.txt' % index, 'rb') as m_file:
                m_payload = m_file.read()
            r = requests.post(push_url, params=params, headers=headers,
                              data=pc_payload, timeout=timeout)
            r_m = requests.post(push_url, params=params_m, headers=headers,
                                data=m_payload, timeout=timeout)
            print('PC:' + r.text + ',' + 'M:' + r_m.text)
        except (IOError, requests.RequestException) as error:
            # Best effort: report the failure and move on to the next batch.
            print(error)
    print('Finish!!!')
if __name__ == '__main__':
    # Run main() once a day at 18:00:00 local time.  The process sleeps until
    # the next run instead of polling the clock in a tight loop, which kept
    # one CPU core busy around the clock.
    run_hour = 18
    while True:
        now = time.localtime()
        wait_seconds = (((run_hour - now.tm_hour) % 24) * 3600
                        - now.tm_min * 60 - now.tm_sec)
        if wait_seconds < 0:
            # Today's 18:00 has already passed; wait for tomorrow's.
            wait_seconds += 24 * 3600
        time.sleep(wait_seconds)
        main()
百度主动推送的好处就是主动,而sitemap的操作是被动的,需要搜索引擎蜘蛛主动去抓。但是需要注意的一点是,主动推送不要重复去推,否则会影响蜘蛛抓取,浪费蜘蛛资源。除了要推送,还有个要注意的地方:海洋CMS的演员链接是动态的URL,并不利于SEO,所以把海洋CMS的伪静态配置好尤其重要。