Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2ULM files 

3========= 

4 

5*Simple and efficient pythonic file-format* 

6 

7Stores ndarrays as binary data and Python's built-in datatypes 

8(bool, int, float, complex, str, dict, list, tuple, None) as json. 

9 

10.. autofunction:: open 

11.. autoexception:: InvalidULMFileError 

12 

13 

14File layout 

15----------- 

16 

17When there is only a single item:: 

18 

19 0: "- of Ulm" (magic prefix, ascii) 

20 8: " " (tag, ascii) 

21 24: version (int64) 

22 32: nitems (int64) 

23 40: 48 (position of offsets, int64) 

24 48: p0 (offset to json data, int64) 

25 56: array1, array2, ... (8-byte aligned ndarrays) 

26 p0: n (length of json data, int64) 

27 p0+8: json data 

28 p0+8+n: EOF 

29 

30 

31Examples 

32-------- 

33 

34Writing: 

35 

36>>> import numpy as np 

37>>> import ase.io.ulm as ulm 

38>>> with ulm.open('x.ulm', 'w') as w: 

39... w.write(a=np.ones(7), b=42, c='abc') 

40... w.write(d=3.14) 

41 

42 

43Reading: 

44 

45>>> r = ulm.open('x.ulm') 

46>>> print(r.c) 

47abc 

48>>> r.close() 

49 

50To see what's inside 'x.ulm' do this:: 

51 

52 $ ase ulm x.ulm 

53 x.ulm (tag: "", 1 item) 

54 item #0: 

55 { 

56 a: <ndarray shape=(7,) dtype=float64>, 

57 b: 42, 

58 c: abc, 

59 d: 3.14} 

60 

61 

62.. autoclass:: Writer 

63 :members: 

64 

65.. autoclass:: Reader 

66 :members: 

67 

68 

69More examples 

70------------- 

71 

72In the following we append to the ulm-file from above and demonstrate 

73how to write a big array in chunks: 

74 

75>>> w = ulm.open('x.ulm', 'a') 

76>>> w.add_array('bigarray', (10, 1000), float) 

77>>> for i in range(10): 

78... w.fill(np.ones(1000)) 

79... 

80>>> w.close() 

81 

82Now read first and second items: 

83 

84>>> with ulm.open('x.ulm') as r: 

85... print(r.keys()) 

86dict_keys(['a', 'b', 'c', 'd']) 

87>>> with ulm.open('x.ulm', index=1) as r: 

88... print(r.keys()) 

89dict_keys(['bigarray']) 

90 

91To get all the data, it is possible to iterate over the items in the file. 

92 

93>>> for i, r in enumerate(ulm.Reader('x.ulm')): 

94... for k in r.keys(): 

95... print(i, k) 

960 a 

970 b 

980 c 

990 d 

1001 bigarray 

101>>> r.close() 

102 

103The different parts (items) of the file are numbered by the index 

104argument: 

105 

106>>> r = ulm.Reader('x.ulm') 

107>>> r[1].bigarray.shape 

108(10, 1000) 

109>>> r.close() 

110 

111 

112Versions 

113-------- 

114 

1151) Initial version. 

116 

1172) Added support for big endian machines. Json data may now have 

118 _little_endian=False item. 

119 

1203) Changed magic string from "AFFormat" to "- of Ulm". 

121""" 

122 

123import numbers 

124from pathlib import Path 

125from typing import Union, Set 

126 

127import numpy as np 

128 

129from ase.io.jsonio import encode, decode 

130from ase.utils import plural 

131 

132 

# Version number of the file format written into the header of new files
# (see "Versions" in the module docstring).
VERSION = 3
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...

135 

136 

def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing
        file.
    index: int
        Index of item to read.  Defaults to 0.  Must be left as None when
        writing or appending.
    tag: str
        Magic ID string.  Must be left as None when reading.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' and 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against a tuple: a substring test against 'wa' would also
    # accept '' and 'wa'.
    if mode not in ('w', 'a'):
        raise ValueError(
            "Invalid mode {!r}: must be 'r', 'w' or 'a'".format(mode))
    assert index is None
    return Writer(filename, mode, tag or '')

160 

161 

# Alias for the module-level open() under a name that does not shadow the
# builtin open().
ulmopen = open

163 

164 

def align(fd):
    """Pad *fd* with '#' bytes up to the next multiple of 8 bytes.

    Nothing is written if the current position is already a multiple
    of 8.  Returns the aligned position.
    """
    position = fd.tell()
    padding = -position % 8  # number of bytes missing to the next boundary
    if padding:
        fd.write(b'#' * padding)
    return position + padding

173 

174 

def writeint(fd, n, pos=None):
    """Write *n* to *fd* as a little-endian 64 bit integer.

    If *pos* is given, seek there first; otherwise write at the current
    position.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    # The on-disk byte order is little endian; swap on big endian machines.
    raw = value.tobytes() if np.little_endian else value.byteswap().tobytes()
    fd.write(raw)

183 

184 

def readints(fd, n):
    """Read *n* little-endian 64 bit integers from the current position.

    Returns an int64 ndarray in native byte order.
    """
    nbytes = int(n * 8)
    values = np.frombuffer(fd.read(nbytes), dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() returns a read-only view, so swap into a new array
    # instead of in place.
    return values.byteswap()

192 

193 

def file_has_fileno(fd):
    """Tell whether *fd* has a working fileno() method.

    array.tofile(fd) works only on files with fileno(), and numpy may
    write faster to physical files using fileno().  For files without
    fileno() the callers fall back to fd.write(array.tobytes()), so they
    need to know which kind of file they have.
    """
    try:
        # AttributeError: no such method.  OSError (alias IOError): the
        # method exists but the file has no underlying descriptor.
        fd.fileno()
    except (AttributeError, IOError):
        return False
    else:
        return True

209 

210 

class Writer:
    """Writer for ulm-files.

    Simple values are collected in the ``data`` dict and written as json
    by sync(); ndarrays are written directly to the file and only their
    shape, dtype and file position are stored in the json data.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename (str or Path) or an already open binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect the json data in.  Passed by child()
            when creating a child-writer on an already open file; leave
            as None otherwise.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            # Top-level writer: set up the data dict, open the file and
            # prepare (or read) the header.
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            # Start a new file when asked to ('w') or when appending to a
            # path that does not exist or is empty.
            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = ('- of Ulm{0:16}'.format(tag).encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                if isinstance(fd, Path):
                    fd = fd.open('r+b')

                version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
                assert version == VERSION
                # Pad the offsets read from the file with zeros up to the
                # smallest power of N1 that holds nitems entries.
                n = 1
                while self.nitems > n:
                    n *= N1
                padding = np.zeros(n - self.nitems, np.int64)
                self.offsets = np.concatenate((offsets, padding))
                fd.seek(0, 2)  # new data goes at the end of the file

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        """Return the writer itself for use in a with-statement."""
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Close the writer when leaving a with-statement."""
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        # Array data starts at the next 8-byte boundary.
        i = align(self.fd)

        # A trailing '.' in the key marks a non-json entry; the value
        # records shape, dtype name and file position of the array.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        assert self.nmissing == 0, 'last array not done'

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        """Write the pending file header, if it has not been written yet."""
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        # The trailing dimensions of the chunk must match those of the
        # array declared with add_array().
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        # Position of the json block (length followed by the json text):
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a table N1 times larger at
            # the (aligned) end of the file and store its position at
            # byte 40 of the header.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        # Register the new item in the offsets table and update the item
        # count stored at byte 32 of the header.
        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item.
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            # Positional form: exactly one (name, value) pair.
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain scalars.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        # The child collects its values in a nested dict of this writer's
        # data and writes arrays to the same file.
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file.

        Data that has not been written yet is written with sync() first.
        """
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        """Return the number of items written to the file so far."""
        return int(self.nitems)

423 

424 

class DummyWriter:
    """Stand-in with the same methods as Writer that writes nothing."""

    def __enter__(self):
        """Return the dummy writer itself for use in a with-statement."""
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Call close() when leaving a with-statement."""
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Do nothing."""

    def fill(self, a):
        """Do nothing."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Do nothing."""

    def child(self, name):
        """Return the dummy writer itself as its own child."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        """Return 0: a dummy writer never holds any items."""
        return 0

452 

453 

def read_header(fd):
    """Read the file header from the start of *fd*.

    Returns the tuple (tag, version, nitems, pos0, offsets), where
    offsets is the array of nitems offsets read from position pos0.
    Raises InvalidULMFileError if the file does not start with one of
    the known magic prefixes.
    """
    fd.seek(0)
    magic = fd.read(8)
    # "AFFormat" is the magic string of file versions before 3.
    if magic not in (b'- of Ulm', b'AFFormat'):
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)

463 

464 

class InvalidULMFileError(IOError):
    """Raised when a file does not start with the ulm magic prefix."""

467 

468 

class Reader:
    """Read access to one item of an ulm-file.

    The keys of the item are available as attributes.  ndarrays are read
    from the file when accessed; nested entries are Reader objects
    themselves.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            File to read from.  Anything without a read() method is
            taken to be a path and is opened in binary mode.
        index: int
            Index of item to read.
        data: dict
            Already decoded data of an item.  When given, the file header
            is not read (used for nested readers and by __getitem__).
        _little_endian: bool
            Byte order of the arrays described by *data*.

        May raise :class:`InvalidULMFileError`.
        """

        self._little_endian = _little_endian

        opened_here = not hasattr(fd, 'read')
        if opened_here:
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        try:
            if data is None:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}

            self._parse_data(data)
        except Exception:
            # Do not leak a file that we opened ourselves.
            if opened_here:
                fd.close()
            raise

    def __enter__(self):
        """Return the reader itself for use in a with-statement."""
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Close the file when leaving a with-statement."""
        self.close()

    def _parse_data(self, data):
        """Build self._data from the decoded json data of an item.

        Keys ending in '.' describe either an ndarray (replaced by an
        NDArrayReader) or a nested dict (replaced by a Reader); the
        trailing '.' is removed from the key.
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return the keys of the current item (a dict keys view)."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        """Return the value stored under key *attr*.

        ndarrays are read from the file and returned as arrays.  Raises
        AttributeError if there is no such key.
        """
        try:
            # Go through __dict__ so that a missing _data (object not
            # initialized yet, e.g. while being copied) gives an
            # AttributeError instead of recursing into __getattr__ forever.
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr)
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        """Tell whether the current item has the key *key*."""
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current index to the last.

        The same Reader object is yielded every time; it is re-pointed at
        the next item before each step.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for array *name* without reading it.

        With *indices*, return a proxy for the sub-array selected by
        indexing the leading dimensions.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        """Return the number of items in the file."""
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item *index*.

        Also updates self._little_endian from the item's
        '_little_endian' entry (True when absent).
        """
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line string listing the keys and their values.

        Arrays are shown as a shape/dtype summary unless *verbose* is
        true, in which case they are read and printed in full.
        """
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                # Indent continuation lines of multi-line values.
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append('{}{}: {}'.format(indent, key, s))
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        """Return the tostr() listing on a single line."""
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the underlying file."""
        self._fd.close()

608 

609 

class NDArrayReader:
    """Lazy reader for one ndarray stored in an ulm-file.

    Nothing is read until the object is indexed or read() is called.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create array reader.

        fd: file object
            Open binary file holding the array data.
        shape: sequence of int
            Shape of the stored array.
        dtype: numpy dtype
            Data type of the stored array.
        offset: int
            File position of the first byte of the array.
        little_endian: bool
            Whether the array was stored in little endian byte order.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Optional post-processing applied to everything that is read:
        self.scale = 1.0  # factor the data is multiplied by
        self.length_of_last_dimension = None  # truncate last axis to this

    def __len__(self):
        """Return the length of the first dimension."""
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read and return part of the array.

        *i* is an integer or a slice selecting along the first dimension.
        Only the rows spanned by the selection are read from the file.
        """
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]

        start, stop, step = i.indices(len(self))
        rows = range(start, stop, step)
        # Contiguous block of rows [first, first + nrows) covering the
        # selection.  Handles empty selections and negative steps, where
        # stop - start is negative.
        if not rows:
            first, nrows = 0, 0
        elif step > 0:
            first, nrows = rows[0], rows[-1] - rows[0] + 1
        else:
            first, nrows = rows[-1], rows[0] - rows[-1] + 1

        stride = int(np.prod(self.shape[1:], dtype=int))  # numbers per row
        count = nrows * stride
        if count == 0:
            a = np.empty(0, self.dtype)
        else:
            self.fd.seek(int(self.offset + first * self.itemsize * stride))
            if self.hasfileno:
                a = np.fromfile(self.fd, self.dtype, count)
            else:
                # Not as fast, but works for reading from tar-files:
                a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                                  self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])
        if step != 1:
            # For a negative step this starts from the last row read,
            # which is the first row of the selection.
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() data cannot be scaled in place.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return an NDArrayReader for the sub-array at *indices*.

        *indices* are integer indices for the leading dimensions.  The
        scale of this reader is passed on to the proxy.
        """
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # len(indices) rather than the loop variable, so that calling
        # without indices works too.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p

672 

673 

def print_ulm_info(filename, index=None, verbose=False):
    """Print a summary of the contents of an ulm-file.

    filename: str
        Name of the file to inspect.
    index: int
        Index of the only item to print.  Defaults to printing all items.
    verbose: bool
        Print the contents of arrays instead of just shape and dtype.
    """
    # Use the reader as a context manager so that the file is closed
    # also when printing fails.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{0} (tag: "{1}", {2})'.format(filename, b.get_tag(),
                                             plural(len(b), 'item')))
        for i in indices:
            print('item #{0}:'.format(i))
            print(b[i].tostr(verbose))

685 

686 

def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader: str, Path or Reader
        Where to copy from.  A filename is opened (and closed again) here.
    writer: str, Path or Writer
        Where to copy to.  A filename is opened (and closed again) here.
    exclude: set of str
        Keys to skip.  Each key is written with a leading '.', with the
        levels of nested entries joined by '.', as in '.a' or '.a.b'.
        The set is only read, never modified.
    name: str
        Dotted name of the entry being copied; used for the recursion
        into nested entries.
    """
    close_reader = not isinstance(reader, Reader)
    if close_reader:
        reader = Reader(reader)
    try:
        close_writer = not isinstance(writer, Writer)
        if close_writer:
            writer = Writer(writer)
        try:
            for key, value in reader._data.items():
                if name + '.' + key in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, name + '.' + key)
                else:
                    writer.write(key, value)
        finally:
            # Close what was opened here, also when copying fails.
            if close_writer:
                writer.close()
    finally:
        if close_reader:
            reader.close()

712 writer.close()