目前,我正在使用h5py生成hdf5数据集.我有类似的东西
import h5py
import numpy as np

# Parse the entire CSV into one structured array; dtype=None asks numpy to
# infer each column's type and names=True takes field names from the header.
my_data = np.genfromtxt(
    "/tmp/data.csv",
    delimiter=",",
    dtype=None,
    names=True,
)
myFile = "/tmp/f.hdf"

# Mode "a" opens the file read/write and creates it if it does not exist.
with h5py.File(myFile, "a") as f:
    # NOTE(review): `vendor` and `dataSet` are not defined in this snippet;
    # they are assumed to be provided by surrounding code.
    dset = f.create_dataset(
        '%s/%s' % (vendor, dataSet),
        data=my_data,
        compression="gzip",
        compression_opts=9,
    )
这适用于相对较大的ASCII文件(400MB).我想对更大的数据集(40GB)做同样的事情.使用h5py有更好或更有效的方法吗?我想避免将整个数据集加载到内存中.
有关数据的一些信息:
我不知道数据的类型.理想情况下,我想使用np.loadtxt()中的dtype=None.
我不知道文件的大小(维度),它们各不相同.
ali_m.. 18
您可以先从文本文件的开头读取一小块行,以此推断数据的dtype.得到dtype之后,您可以创建一个可调整大小的HDF5数据集,然后迭代地把文本文件中的行块逐块写入其中.
这是一个生成器,它从文本文件中生成连续的行块作为numpy数组:
import numpy as np
import warnings


def iter_genfromtxt(path, chunksize=100, **kwargs):
    """Yield consecutive chunks of rows from a text file as numpy arrays.

    Args:
        path: Path to the text file.
        chunksize: Maximum number of rows to yield at a time.
        **kwargs: Additional keyword arguments are passed to `np.genfromtxt`,
            with the exception of `skip_footer` which is unsupported.
            `names` and `skip_header` (or its alias `skiprows`) are applied
            to the first read only. `max_rows` caps the total number of rows
            yielded across all chunks.

    Yields:
        A sequence of `np.ndarray`s with a maximum row dimension of
        `chunksize`. Every chunk after the first is parsed with the dtype
        of the first chunk.
    """
    names = kwargs.pop('names', None)
    max_rows = kwargs.pop('max_rows', None)
    skip_header = kwargs.pop('skip_header', kwargs.pop('skiprows', 0))
    if kwargs.pop('skip_footer', None) is not None:
        warnings.warn('`skip_footer` will be ignored')

    with open(path, 'rb') as f:

        # The first chunk is handled separately, since we may wish to skip
        # rows, read column headers etc.
        # np.atleast_1d: genfromtxt squeezes a single parsed row down to a
        # 0-d array, on which len() and slicing would fail.
        chunk = np.atleast_1d(
            np.genfromtxt(f, max_rows=chunksize, skip_header=skip_header,
                          names=names, **kwargs))

        # Ensure that subsequent chunks have consistent dtypes and field names
        kwargs.update({'dtype': chunk.dtype})

        while len(chunk):
            yield chunk[:max_rows]
            if max_rows is not None:
                max_rows -= len(chunk)
                if max_rows <= 0:
                    # A plain return ends the generator. Raising
                    # StopIteration here is turned into RuntimeError by
                    # PEP 479 (Python 3.7+).
                    return
            with warnings.catch_warnings():
                # Reading at end-of-file is how the loop detects that it is
                # done, so the "empty input" warning is expected noise here.
                warnings.filterwarnings(
                    'ignore', message='genfromtxt: Empty input file')
                chunk = np.atleast_1d(
                    np.genfromtxt(f, max_rows=chunksize, **kwargs))
现在假设我们有一个.csv
文件包含:
strings,ints,floats
a,1,0.1256290043
b,2,0.0071402451
c,3,0.2551627907
d,4,0.7958570533
e,5,0.8968247722
f,6,0.7291124437
g,7,0.4196829806
h,8,0.398944394
i,9,0.8718244087
j,10,0.67605461
k,11,0.7105670336
l,12,0.6341504091
m,13,0.1324232855
n,14,0.7062503808
o,15,0.1915132527
p,16,0.4140093777
q,17,0.1458217602
r,18,0.1183596433
s,19,0.0014556247
t,20,0.1649811301
我们可以每次读取5行数据,并将得到的数组写入一个可调整大小的数据集:
import h5py

# Set up the chunked reader: 5 rows per chunk, column types inferred.
gen = iter_genfromtxt(
    '/tmp/test.csv',
    chunksize=5,
    delimiter=',',
    names=True,
    dtype=None,
)

# Pull the first chunk up front; it supplies the column dtypes and the
# initial shape of the dataset.
chunk = next(gen)
dtype = chunk.dtype
row_count = chunk.shape[0]

with h5py.File('/tmp/test.h5', 'w') as f:

    # Leave axis 0 unbounded so the dataset can grow as chunks arrive.
    maxshape = (None,) + chunk.shape[1:]
    dset = f.create_dataset(
        'data',
        shape=chunk.shape,
        maxshape=maxshape,
        chunks=chunk.shape,
        dtype=chunk.dtype,
    )

    # Store the rows that were already read.
    dset[:] = chunk

    for chunk in gen:
        # Grow the dataset by exactly the size of the incoming chunk, then
        # fill the newly added tail with it.
        new_count = row_count + chunk.shape[0]
        dset.resize(new_count, axis=0)
        dset[row_count:] = chunk
        row_count = new_count
输出:
with h5py.File('/tmp/test.h5', 'r') as f: print(repr(f['data'][:])) # array([(b'a', 1, 0.1256290043), (b'b', 2, 0.0071402451), # (b'c', 3, 0.2551627907), (b'd', 4, 0.7958570533), # (b'e', 5, 0.8968247722), (b'f', 6, 0.7291124437), # (b'g', 7, 0.4196829806), (b'h', 8, 0.398944394), # (b'i', 9, 0.8718244087), (b'j', 10, 0.67605461), # (b'k', 11, 0.7105670336), (b'l', 12, 0.6341504091), # (b'm', 13, 0.1324232855), (b'n', 14, 0.7062503808), # (b'o', 15, 0.1915132527), (b'p', 16, 0.4140093777), # (b'q', 17, 0.1458217602), (b'r', 18, 0.1183596433), # (b's', 19, 0.0014556247), (b't', 20, 0.1649811301)], # dtype=[('strings', 'S1'), ('ints', '<i8'), ('floats', '<f8')])
对于您的数据集,您可能希望使用更大的块大小(chunksize).
1> ali_m..:您可以先从文本文件的开头读取一小块行,以此推断数据的dtype.得到dtype之后,您可以创建一个可调整大小的HDF5数据集,然后迭代地把文本文件中的行块逐块写入其中.
这是一个生成器,它从文本文件中生成连续的行块作为numpy数组:
import numpy as np import warnings def iter_genfromtxt(path, chunksize=100, **kwargs): """Yields consecutive chunks of rows from a text file as numpy arrays. Args: path: Path to the text file. chunksize: Maximum number of rows to yield at a time. **kwargs: Additional keyword arguments are passed to `np.genfromtxt`, with the exception of `skip_footer` which is unsupported. Yields: A sequence of `np.ndarray`s with a maximum row dimension of `chunksize`. """ names = kwargs.pop('names', None) max_rows = kwargs.pop('max_rows', None) skip_header = kwargs.pop('skip_header', kwargs.pop('skiprows', 0)) if kwargs.pop('skip_footer', None) is not None: warnings.warn('`skip_footer` will be ignored') with open(path, 'rb') as f: # The first chunk is handled separately, since we may wish to skip rows, # read column headers etc. chunk = np.genfromtxt(f, max_rows=chunksize, skip_header=skip_header, names=names, **kwargs) # Ensure that subsequent chunks have consistent dtypes and field names kwargs.update({'dtype':chunk.dtype}) while len(chunk): yield chunk[:max_rows] if max_rows is not None: max_rows -= len(chunk) if max_rows <= 0: raise StopIteration chunk = np.genfromtxt(f, max_rows=chunksize, **kwargs)现在假设我们有一个
.csv
文件包含:strings,ints,floats a,1,0.1256290043 b,2,0.0071402451 c,3,0.2551627907 d,4,0.7958570533 e,5,0.8968247722 f,6,0.7291124437 g,7,0.4196829806 h,8,0.398944394 i,9,0.8718244087 j,10,0.67605461 k,11,0.7105670336 l,12,0.6341504091 m,13,0.1324232855 n,14,0.7062503808 o,15,0.1915132527 p,16,0.4140093777 q,17,0.1458217602 r,18,0.1183596433 s,19,0.0014556247 t,20,0.1649811301我们可以一次以5行的形式读取这些数据,并将生成的数组写入可调整大小的数据集:
import h5py # Initialize the generator gen = iter_genfromtxt('/tmp/test.csv', chunksize=5, delimiter=',', names=True, dtype=None) # Read the first chunk to get the column dtypes chunk = next(gen) dtype = chunk.dtype row_count = chunk.shape[0] with h5py.File('/tmp/test.h5', 'w') as f: # Initialize a resizable dataset to hold the output maxshape = (None,) + chunk.shape[1:] dset = f.create_dataset('data', shape=chunk.shape, maxshape=maxshape, chunks=chunk.shape, dtype=chunk.dtype) # Write the first chunk of rows dset[:] = chunk for chunk in gen: # Resize the dataset to accommodate the next chunk of rows dset.resize(row_count + chunk.shape[0], axis=0) # Write the next chunk dset[row_count:] = chunk # Increment the row count row_count += chunk.shape[0]输出:
with h5py.File('/tmp/test.h5', 'r') as f: print(repr(f['data'][:])) # array([(b'a', 1, 0.1256290043), (b'b', 2, 0.0071402451), # (b'c', 3, 0.2551627907), (b'd', 4, 0.7958570533), # (b'e', 5, 0.8968247722), (b'f', 6, 0.7291124437), # (b'g', 7, 0.4196829806), (b'h', 8, 0.398944394), # (b'i', 9, 0.8718244087), (b'j', 10, 0.67605461), # (b'k', 11, 0.7105670336), (b'l', 12, 0.6341504091), # (b'm', 13, 0.1324232855), (b'n', 14, 0.7062503808), # (b'o', 15, 0.1915132527), (b'p', 16, 0.4140093777), # (b'q', 17, 0.1458217602), (b'r', 18, 0.1183596433), # (b's', 19, 0.0014556247), (b't', 20, 0.1649811301)], # dtype=[('strings', 'S1'), ('ints', '<i8'), ('floats', '<f8')])
对于您的数据集,您可能希望使用更大的块大小(chunksize).