Coverage for /builds/ase/ase/ase/io/ulm.py : 90.00%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2ULM files
3=========
5*Simple and efficient pythonic file-format*
7Stores ndarrays as binary data and Python's built-in datatypes
8(bool, int, float, complex, str, dict, list, tuple, None) as json.
10.. autofunction:: open
11.. autoexception:: InvalidULMFileError
14File layout
15-----------
17When there is only a single item::
19 0: "- of Ulm" (magic prefix, ascii)
20 8: " " (tag, ascii)
21 24: version (int64)
22 32: nitems (int64)
23 40: 48 (position of offsets, int64)
24 48: p0 (offset to json data, int64)
25 56: array1, array2, ... (8-byte aligned ndarrays)
26 p0: n (length of json data, int64)
27 p0+8: json data
28 p0+8+n: EOF
31Examples
32--------
34Writing:
36>>> import numpy as np
37>>> import ase.io.ulm as ulm
38>>> with ulm.open('x.ulm', 'w') as w:
39... w.write(a=np.ones(7), b=42, c='abc')
40... w.write(d=3.14)
43Reading:
45>>> r = ulm.open('x.ulm')
46>>> print(r.c)
47abc
48>>> r.close()
50To see what's inside 'x.ulm' do this::
52 $ ase ulm x.ulm
53 x.ulm (tag: "", 1 item)
54 item #0:
55 {
56 a: <ndarray shape=(7,) dtype=float64>,
57 b: 42,
58 c: abc,
59 d: 3.14}
62.. autoclass:: Writer
63 :members:
65.. autoclass:: Reader
66 :members:
69More examples
70-------------
72In the following we append to the ulm-file from above and demonstrate
73how to write a big array in chunks:
75>>> w = ulm.open('x.ulm', 'a')
76>>> w.add_array('bigarray', (10, 1000), float)
77>>> for i in range(10):
78... w.fill(np.ones(1000))
79...
80>>> w.close()
82Now read first and second items:
84>>> with ulm.open('x.ulm') as r:
85... print(r.keys())
86dict_keys(['a', 'b', 'c', 'd'])
87>>> with ulm.open('x.ulm', index=1) as r:
88... print(r.keys())
89dict_keys(['bigarray'])
91To get all the data, it is possible to iterate over the items in the file.
93>>> for i, r in enumerate(ulm.Reader('x.ulm')):
94... for k in r.keys():
95... print(i, k)
960 a
970 b
980 c
990 d
1001 bigarray
101>>> r.close()
103The different parts (items) of the file are numbered by the index
104argument:
106>>> r = ulm.Reader('x.ulm')
107>>> r[1].bigarray.shape
108(10, 1000)
109>>> r.close()
112Versions
113--------
1151) Initial version.
1172) Added support for big endian machines. Json data may now have
118 _little_endian=False item.
1203) Changed magic string from "AFFormat" to "- of Ulm".
121"""
123import numbers
124from pathlib import Path
125from typing import Union, Set
127import numpy as np
129from ase.io.jsonio import encode, decode
130from ase.utils import plural
# File-format version stored in the header of newly written files.  Writer
# asserts that a file opened for appending has exactly this version.
VERSION = 3
# Growth factor of the offsets table: when the table is full, Writer.sync()
# allocates a new one that is N1 times larger.
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...
def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed with mode 'r'.
    tag: str
        Magic ID string.  Only allowed with modes 'w' and 'a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against a tuple: a substring test ("mode not in 'wa'") would
    # let '' and 'wa' slip through.
    if mode not in ('w', 'a'):
        raise ValueError(
            'Invalid mode {!r}: must be "r", "w" or "a"'.format(mode))
    assert index is None
    return Writer(filename, mode, tag or '')


# Alias that stays usable where the name "open" refers to the builtin.
ulmopen = open
def align(fd):
    """Pad *fd* with ``#`` bytes up to the next multiple of 8 bytes.

    Returns the aligned position.  Nothing is written when the current
    position is already a multiple of 8.
    """
    position = fd.tell()
    remainder = position % 8
    if remainder:
        padding = 8 - remainder
        fd.write(b'#' * padding)
        position += padding
    return position
def writeint(fd, n, pos=None):
    """Write *n* to *fd* as a little-endian 64 bit integer.

    If *pos* is given, seek to that position first; otherwise write at the
    current position.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if np.little_endian:
        raw = value.tobytes()
    else:
        # Integers are always stored little-endian in the file.
        raw = value.byteswap().tobytes()
    fd.write(raw)
def readints(fd, n):
    """Read *n* little-endian 64 bit integers from *fd* as an int64 array."""
    raw = fd.read(int(n * 8))
    ints = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return ints
    # frombuffer() returns a read-only view, so swap into a new array
    # instead of swapping in place.
    return ints.byteswap()
def file_has_fileno(fd):
    """Return True if *fd* has a working fileno() method, else False.

    array.tofile(fd) works only on files with fileno(), and numpy may write
    faster to physical files using fileno().  For files without fileno()
    we use fd.write(array.tobytes()) instead, so callers need to know which
    kind of file they have.
    """
    try:
        # A missing method raises AttributeError; a file object without an
        # underlying descriptor raises IOError/OSError when called.
        fd.fileno()
    except (AttributeError, IOError):
        return False
    else:
        return True
class Writer:
    """Writer for ulm-files.

    Plain Python data is collected in ``self.data`` and written as json by
    :meth:`sync`; ndarrays are written directly to the file as binary data
    and only their shape, dtype and file position go into the json part.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename, or an already opened binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to fill in.  Given by :meth:`child` when creating a
            child-writer that shares the file of its parent; in that case
            no file is opened and no header is prepared.
        """

        # Tuple membership: "mode in 'aw'" would also accept '' and 'aw'.
        assert mode in ('a', 'w')

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                # New file (or appending to a missing/empty file).
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = ('- of Ulm{0:16}'.format(tag).encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                if isinstance(fd, Path):
                    fd = fd.open('r+b')

                version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
                assert version == VERSION
                # Size of the offsets table: smallest power of N1 that
                # holds all existing items.
                n = 1
                while self.nitems > n:
                    n *= N1
                padding = np.zeros(n - self.nitems, np.int64)
                self.offsets = np.concatenate((offsets, padding))
                fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check first, so that a failure neither pads the file in the
        # middle of the unfinished array nor registers the new name.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, numbers.Integral):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        # Explicit 64 bit accumulator, converted to a plain Python int.
        self.nmissing = int(np.prod(shape, dtype=np.int64))

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # Offsets table is full: write a new, N1 times larger table at
            # the end of the file and point the header (byte 40) to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item:
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file.

        Pending data is written with sync() first.  The file is closed
        even if that fails.
        """
        try:
            n = int('_little_endian' in self.data)
            if len(self.data) > n:
                # There is more than the "_little_endian" key.
                # Write that stuff before closing:
                self.sync()
            else:
                # Make sure header has been written (empty ulm-file):
                self._write_header()
        finally:
            self.fd.close()

    def __len__(self):
        return int(self.nitems)
class DummyWriter:
    """Object with the methods of :class:`Writer` that discards all data."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Ignore the array declaration."""

    def fill(self, a):
        """Ignore the array chunk."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Ignore the data."""

    def child(self, name):
        """Return this same object, which also acts as its own child."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        return 0
def read_header(fd):
    """Read the header of an ulm-file from the start of *fd*.

    Returns the tuple ``(tag, version, nitems, pos0, offsets)`` where
    *offsets* holds the *nitems* integers stored at position *pos0*.
    Raises InvalidULMFileError if the file does not start with one of the
    known magic prefixes.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)
class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic prefix."""
class Reader:
    """Reader for one item of an ulm-file.

    The values of the item are available as attributes; ndarrays are read
    from the file when the attribute is accessed.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Filename, or an object with a read() method.
        index: int
            Index of item to read.
        data: dict
            Already decoded item data.  When given, no header is read from
            the file.
        _little_endian: bool
            Byte order of the arrays in *data*.
        """

        self._little_endian = _little_endian

        opened_here = not hasattr(fd, 'read')
        if opened_here:
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        try:
            if data is None:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}

            self._parse_data(data)
        except BaseException:
            # Don't leak a file that we opened ourselves.
            if opened_here:
                fd.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        # Keys ending in '.' hold either an ndarray description or the
        # data of a child; the trailing '.' is dropped from the name.
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Look up _data in the instance dict directly: going through
        # self._data would call __getattr__ again, recursing without end,
        # whenever _data has not been set yet.
        try:
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        # Yields this same object once per item, re-parsed for each
        # following item.
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for *name* (or a proxy for a part of it)
        without reading the array."""
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        # Each item is stored as an int64 length followed by json data.
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line string listing the keys and values.

        With *verbose*, arrays are read and printed in full instead of
        being summarized by shape and dtype.
        """
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append('{}{}: {}'.format(indent, key, s))
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the underlying file."""
        self._fd.close()
class NDArrayReader:
    """Reader for an ndarray stored in a file.

    Nothing is read until the object is indexed or read() is called.
    Indexing works along the first axis only.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create reader for array with given *shape* and *dtype* stored
        at byte position *offset* of *fd*.

        *little_endian* tells the byte order of the stored data.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Factor applied to everything that is read:
        self.scale = 1.0
        # If set, only this many elements of the last axis are returned:
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]
        start, stop, step = i.indices(len(self))
        # Contiguous range of rows [first, last) that covers the slice.
        # Clamping keeps the row count non-negative for empty slices, and
        # for negative steps the covering range runs from stop + 1 up to
        # and including start.
        if step > 0:
            first, last = start, max(start, stop)
        else:
            first, last = stop + 1, max(start, stop) + 1
        nrows = last - first
        stride = np.prod(self.shape[1:], dtype=int)
        offset = self.offset + first * self.itemsize * stride
        self.fd.seek(offset)
        count = nrows * stride
        if count == 0:
            a = np.empty(0, self.dtype)
        elif self.hasfileno:
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])
        if step != 1:
            # For a negative step this starts at the last row read, which
            # is row number "start".
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() returns readonly array
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return NDArrayReader for the sub-array selected by fixing the
        first ``len(indices)`` axes to *indices*.

        With no indices, the proxy covers the whole array.
        """
        stride = self.size // len(self)
        start = 0
        for axis, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[axis + 1]
        offset = self.offset + start * self.itemsize
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p
def print_ulm_info(filename, index=None, verbose=False):
    """Print tag, number of items and contents of an ulm-file.

    filename: str
        Filename.
    index: int
        Index of the item to print.  All items are printed if None.
    verbose: bool
        Passed on to Reader.tostr(): print arrays in full instead of
        only their shape and dtype.
    """
    # The with-statement closes the file when done, also on errors.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{0} (tag: "{1}", {2})'.format(filename, b.get_tag(),
                                             plural(len(b), 'item')))
        for i in indices:
            print('item #{0}:'.format(i))
            print(b[i].tostr(verbose))
def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader, writer:
        Reader/Writer objects, or filenames.  Files given by name are
        opened here and closed again before returning, also when copying
        fails; objects passed in are left open.
    exclude:
        Keys to skip.  Keys are written with a leading dot and with the
        names of the enclosing children, like '.key' or '.child.key'.
        The default set is shared between calls and is never modified.
    name:
        Prefix for the keys; used for the recursive calls that copy
        children.
    """
    close_reader = False
    close_writer = False
    try:
        if not isinstance(reader, Reader):
            reader = Reader(reader)
            close_reader = True
        if not isinstance(writer, Writer):
            writer = Writer(writer)
            close_writer = True
        for key, value in reader._data.items():
            if name + '.' + key in exclude:
                continue
            if isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, Reader):
                copy(value, writer.child(key), exclude, name + '.' + key)
            else:
                writer.write(key, value)
    finally:
        # Close only what was opened here; the nested try makes sure the
        # writer gets closed even if closing the reader fails.
        try:
            if close_reader:
                reader.close()
        finally:
            if close_writer:
                writer.close()