import DBHelper  
import CodeHelper  
import urllib2  
from bs4 import BeautifulSoup  
import threading as thread  
import Queue  
import time  
  
class Resource:
    """One row of the ``resource`` table: a URL with its link text, content and status.

    Every persistence method opens its own ``DBHelper`` connection. In this
    file ``do_task`` uses status 0 for rows that still have to be fetched and
    status 1 for rows whose content has been stored.
    """

    def __init__(self, url, text, content, status):
        """Store the column values; nothing is written to the database here."""
        self._url = url
        self._text = text
        self._content = content
        self._status = status

    def _insert_log_line(self):
        """Return the progress message printed just before a row is inserted."""
        # One placeholder per value: the previous message had three
        # placeholders for four values and raised TypeError on every insert.
        return 'url: %s text: %s content: %s status: %s' % (
            self._url, self._text, self._content, self._status)

    def _execute_and_commit(self, sql, params):
        """Run one write statement on a fresh connection, commit and close."""
        dbHelper = DBHelper.DBHelper()
        try:
            dbHelper.execute(sql, params)
            # NOTE(review): `commint` looks like a misspelling of `commit`;
            # kept as-is because DBHelper is not visible here -- confirm.
            dbHelper.commint()
        finally:
            # Release the connection even when the statement fails.
            dbHelper.close()

    def insert(self):
        """Insert this resource unless a row with the same url already exists."""
        dbHelper = DBHelper.DBHelper()
        try:
            sql = 'select * from resource where url=%s'
            existing = dbHelper.read(sql, [self._url])
            # NOTE(review): assumes DBHelper.read returns None when no row
            # matches -- confirm against DBHelper.
            if existing is not None:
                return
            sql = 'insert into resource(url,text,content,status) values(%s,%s,%s,%s)'
            print(self._insert_log_line())
            dbHelper.execute(sql, [self._url, self._text, self._content, self._status])
            dbHelper.commint()
        finally:
            # Also covers the early return above, which used to leak the connection.
            dbHelper.close()

    def updateStatus(self):
        """Set the status of the row whose url matches this resource."""
        sql = 'update resource set status=%s where url=%s'
        self._execute_and_commit(sql, [self._status, self._url])

    def updateContentAndStatus(self):
        """Set content and status of the row whose url matches this resource."""
        sql = 'update resource set content=%s,status=%s where url=%s'
        self._execute_and_commit(sql, [self._content, self._status, self._url])

    def readListByStatus(self):
        """Return whatever DBHelper.readList yields for rows with this status."""
        dbHelper = DBHelper.DBHelper()
        sql = 'select * from resource where status=%s'
        # NOTE(review): the connection is not closed here because it is not
        # visible whether the returned rows stay valid after close() -- confirm.
        return dbHelper.readList(sql, [self._status])

    def readList(self):
        """Return whatever DBHelper.readList yields for every row of the table."""
        dbHelper = DBHelper.DBHelper()
        # NOTE(review): connection left open for the same reason as in
        # readListByStatus -- confirm against DBHelper.
        return dbHelper.readList('select * from resource')
          
class ResourceThread(thread.Thread):
    """Daemon worker that drains ``(func, args)`` tasks from a shared queue.

    The thread starts itself on construction and exits as soon as the queue
    is found empty.
    """

    def __init__(self, task_queue):
        """Bind the worker to ``task_queue`` and start it immediately."""
        thread.Thread.__init__(self)
        self._task_queue = task_queue
        self.daemon = True
        self.start()

    def run(self):
        """Execute queued tasks until the queue is empty.

        Each task is called as ``func(args)``: the positional arguments are
        handed over as one tuple, not unpacked.
        """
        print('current thread name %s' % thread.current_thread().name)
        while True:
            try:
                func, args = self._task_queue.get(block=False)
            except Queue.Empty:
                # No work left: this is the only reason the worker stops.
                break
            try:
                func(args)
            except Exception as e:
                # A failing task is reported but must not end the worker,
                # otherwise the remaining tasks could be left unprocessed.
                print(str(e))
            finally:
                # Always balance get() so Queue.join() cannot hang.
                self._task_queue.task_done()
              
class ResourceManager:
    """Owns a task queue pre-filled with ``do_task`` jobs and the worker pool that drains it."""

    def __init__(self, taskNum = 10, threadNum = 2):
        """Queue ``taskNum`` tasks, then start ``threadNum`` worker threads."""
        self._task_queue = Queue.Queue()
        self._threads = []
        # Tasks are queued before any worker exists, so workers never start
        # on an empty queue unless taskNum is 0.
        self.__init__task_queue__(taskNum)
        self.__init__thread_pool(threadNum)

    def __init__task_queue__(self, taskNum):
        """Enqueue ``do_task`` once per index in ``range(taskNum)``."""
        for task_index in range(taskNum):
            print('this is %s task' % task_index)
            self.add_task(do_task, task_index)

    def __init__thread_pool(self, threadNum):
        """Create the workers; each ResourceThread starts itself on construction."""
        for worker_index in range(threadNum):
            print('threadNum %s' % worker_index)
            self._threads.append(ResourceThread(self._task_queue))

    def add_task(self, func, *args):
        """Queue ``func`` together with its positional arguments as a tuple."""
        task = (func, args)
        self._task_queue.put(task)

    def check_queue(self):
        """Return the approximate number of tasks still queued."""
        return self._task_queue.qsize()

    def wait_for_complete(self):
        """Block until every worker that is still running has finished."""
        for worker in self._threads:
            if not worker.is_alive():
                continue
            worker.join()
      
def do_task(args):
    """Fetch every status-0 resource, store its body and record the links it contains.

    For each pending row the page is downloaded, the first 9999 characters of
    its ``<body>`` are saved with status 1, and every ``<a href>`` found on
    the page is inserted as a new status-0 resource.

    ``args`` is only printed; ResourceThread passes the task's argument tuple.
    """
    print('this task args %s' % args)
    resource = Resource(None, None, None, 0)
    data = resource.readListByStatus()
    # Wrap in a 1-tuple so a tuple of rows is printed instead of being
    # misread as the list of format arguments.
    print('read status 0 data is %s' % (data,))
    if data is None:
        return
    for item in data:
        # NOTE(review): assumes column 1 of a resource row is the url -- confirm
        # against the table schema.
        url = item[1]
        if url is None or 'http://' not in url:
            continue
        try:
            response = urllib2.urlopen(url)
            try:
                content = response.read()
            finally:
                response.close()
        except (urllib2.URLError, IOError, ValueError) as e:
            # One unreachable or malformed URL must not abort the other rows.
            print('fetch %s failed: %s' % (url, e))
            continue
        html = BeautifulSoup(content)
        fetch_resource = Resource(url, None, str(html.find('body'))[0:9999], 1)
        fetch_resource.updateContentAndStatus()
        aLinks = html.find_all('a')
        print('aLinks %s' % (aLinks,))
        for aLink in aLinks:
            href = aLink.get('href')
            if not href:
                # Anchors without a target would be stored as rows with no url.
                continue
            a_text = CodeHelper.encodeContent(aLink.get_text())
            print('href %s text %s' % (href, a_text))
            subResource = Resource(href, a_text, '', 0)
            subResource.insert()
              
def execute():
    """Seed the resource table with the start URLs, run the crawl and print the elapsed time."""
    urls = ['http://www.kuwo.cn', 'http://www.1ting.com/', 'http://www.kugou.com/', 'http://y.**.com/']
    for url in urls:
        # Resource takes (url, text, content, status); the call used to pass
        # only three values and raised TypeError. Seeds have no link text and
        # no content yet, matching how do_task inserts un-fetched links.
        resource = Resource(url, None, '', 0)
        resource.insert()

    start = time.time()
    # 20 queued tasks shared by 4 worker threads.
    resource_manager = ResourceManager(20, 4)
    resource_manager.wait_for_complete()
    end = time.time()
    print("cost all time: %s" % (end - start))
  
# Run the crawl only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':  
    execute()