# -*- coding: utf-8 -*-
# download_dataset.py: download the PhysioNet sleep-edfx recordings and
# check each downloaded file against the MD5 sums shipped with the dataset.
import hashlib
import os
import re
import threading
import time

import requests

# HTTP headers for the download requests (a browser-like User-Agent).
# The value holds only the agent string itself: repeating the
# "User-Agent:" field name inside the value would send a malformed header.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.75 Chrome/73.0.3683.75 Safari/537.36'
}


def download(url,name,savedir):
    """Fetch ``url`` and write the response body to ``savedir/name``.

    Args:
        url: Full URL of the file to fetch.
        name: File name to save under, relative to ``savedir``.
        savedir: Existing directory that receives the file.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        OSError: If the target file cannot be opened or written.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params`` (query string), not the HTTP headers.
    r = requests.get(url, headers=headers, timeout=30)
    # Do not save an HTTP error page as if it were the requested file.
    r.raise_for_status()
    with open(os.path.join(savedir, name), "wb") as f:
        f.write(r.content)


def compare_md5(filepath,md5s):
    """Check whether a file's MD5 digest is one of the expected digests.

    Args:
        filepath: Path of the file to hash.
        md5s: Container of expected MD5 hex digest strings.

    Returns:
        True if the file can be read and its MD5 hex digest is in ``md5s``;
        False if the file is missing, unreadable, or has another digest.
    """
    digest = hashlib.md5()
    try:
        with open(filepath, 'rb') as md5file:
            # Hash in 1 MiB chunks so large recordings are never held in
            # memory all at once.
            for chunk in iter(lambda: md5file.read(1 << 20), b''):
                digest.update(chunk)
    except OSError:
        # Missing file, directory, permission problem, read error, ...
        return False
    return digest.hexdigest() in md5s


def downloader(url,filenames,md5s,dir,max_retries=None,retry_delay=1.0):
    """Download each file in ``filenames`` until its MD5 check passes.

    A file that already passes ``compare_md5`` is not downloaded again.

    Args:
        url: Base URL; each file is fetched from ``url + name + '?download'``.
        filenames: Iterable of file names to fetch.
        md5s: Expected MD5 hex digests, passed through to ``compare_md5``.
        dir: Directory the files are saved into (the name shadows the
            builtin but is kept for caller compatibility).
        max_retries: Maximum download attempts per file; ``None`` (default)
            keeps retrying until the checksum matches.
        retry_delay: Seconds to wait before each retry, so a persistent
            failure does not turn into a busy loop against the server.
    """
    for name in filenames:
        filepath = os.path.join(dir, name)
        print('Download:', name)
        attempts = 0
        while not compare_md5(filepath, md5s):
            if max_retries is not None and attempts >= max_retries:
                print('Error:', name, 'giving up after', attempts, 'attempts')
                break
            if attempts and retry_delay > 0:
                time.sleep(retry_delay)
            attempts += 1
            try:
                download(url + name + '?download', name, dir)
            except Exception as e:
                # Worker-thread boundary: report the cause and retry rather
                # than letting one bad file kill the thread.
                print('Warning:', name, 'download failed! we will try again', '(%s)' % (e,))


def rundownloader(url,filenames,md5s,dir,ThreadNum=4):
    """Download ``filenames`` concurrently using worker threads.

    The files are dealt out round-robin to at most ``ThreadNum`` threads,
    each running ``downloader`` on its share. The threads are started but
    not joined, so this function returns while downloads are in progress.

    Args:
        url: Base URL passed to ``downloader``.
        filenames: Sequence of file names to fetch.
        md5s: Expected MD5 hex digests passed to ``downloader``.
        dir: Target directory passed to ``downloader``.
        ThreadNum: Upper bound on the number of worker threads; values
            below 1 are treated as 1.
    """
    filenames = list(filenames)
    if not filenames:
        return
    # Never start more threads than there are files, and at least one.
    workers = max(1, min(int(ThreadNum), len(filenames)))
    for i in range(workers):
        # Round-robin slicing keeps the shares balanced even when the list
        # is shorter than, or not a multiple of, the thread count.
        share = filenames[i::workers]
        t = threading.Thread(target=downloader, args=(url, share, md5s, dir))
        t.start()


savedir = './datasets/sleep-edfx/'
url = 'https://physionet.org/files/sleep-edfx/1.0.0/'
# Example of a full file URL:
# https://physionet.org/files/sleep-edfx/1.0.0/sleep-cassette/SC4001E0-PSG.edf?download

# Each subset has a checksum list at <savedir>/md5/<subset>_MD5SUMS.txt.
# The list is read as whitespace-separated tokens alternating between an
# MD5 digest and a file name: even positions are digests, odd positions names.
for subset in ('sleep-cassette', 'sleep-telemetry'):
    with open(os.path.join(savedir, 'md5/' + subset + '_MD5SUMS.txt'), 'rb') as md5file:
        MD5SUMS = md5file.read().decode('utf-8').split()
    md5s = MD5SUMS[::2]
    filenames = MD5SUMS[1::2]
    print('start download sleep-edfx/' + subset)
    rundownloader(url + subset + '/', filenames, md5s, savedir)

# Disabled alternative: collect the file names from the ".edf" links on the
# index page. RequestWeb is not defined in this file.
# soup,page_info=RequestWeb(url)
# links = soup.find_all('a',href=re.compile(r".edf"))
# filenames = []
# for link in links[1:]:
#     begin = str(link).index('">')
#     stop = str(link).index('</a>')
#     filename = str(link)[begin+2:stop]
#     filenames.append(filename)
#rundownloader(url,filenames,md5s,savedir)