ipld-eth-server/vendor/github.com/ipfs/go-unixfs/io/dagreader.go

492 lines
15 KiB
Go

package io
import (
"bytes"
"context"
"errors"
"io"
ipld "github.com/ipfs/go-ipld-format"
mdag "github.com/ipfs/go-merkledag"
unixfs "github.com/ipfs/go-unixfs"
)
// Common errors returned when creating a reader over an unsupported
// or malformed node.
var (
	// ErrIsDir is returned when asked to read a node representing a
	// directory (directories carry no file data to read).
	ErrIsDir = errors.New("this dag node is a directory")

	// ErrCantReadSymlinks is returned when asked to read a symlink node.
	ErrCantReadSymlinks = errors.New("cannot currently read symlinks")

	// ErrUnkownNodeType is returned for nodes that are neither raw nor
	// protobuf. NOTE: the name misspells "Unknown", but it is exported,
	// so renaming it would break external callers.
	ErrUnkownNodeType = errors.New("unknown node type")
)
// TODO: Rename the `DagReader` interface, this doesn't read *any* DAG, just
// DAGs with UnixFS node (and it *belongs* to the `unixfs` package). Some
// alternatives: `FileReader`, `UnixFSFileReader`, `UnixFSReader`.

// A DagReader provides read-only read and seek access to a unixfs file.
// Different implementations of readers are used for the different
// types of unixfs/protobuf-encoded nodes.
type DagReader interface {
	ReadSeekCloser

	// Size returns the total size in bytes of the file the DAG represents.
	Size() uint64

	// CtxReadFull reads into the given buffer, using the supplied context
	// (instead of the reader's internal one) to fetch child nodes.
	CtxReadFull(context.Context, []byte) (int, error)
}
// A ReadSeekCloser implements interfaces to read, copy, seek and close.
type ReadSeekCloser interface {
	io.Reader
	io.Seeker
	io.Closer
	io.WriterTo // allows efficient streaming of the whole file to a writer
}
// NewDagReader creates a new reader object that reads the data represented by
// the given node, using the passed in DAGService for data retrieval.
//
// Returns ErrIsDir for directory nodes, ErrCantReadSymlinks for symlinks and
// ErrUnkownNodeType for anything that is neither a raw nor a protobuf node.
func NewDagReader(ctx context.Context, n ipld.Node, serv ipld.NodeGetter) (DagReader, error) {
	var size uint64

	switch node := n.(type) {
	case *mdag.RawNode:
		// Raw nodes carry the file data directly in the block.
		size = uint64(len(node.RawData()))

	case *mdag.ProtoNode:
		fsNode, err := unixfs.FSNodeFromBytes(node.Data())
		if err != nil {
			return nil, err
		}

		switch fsNode.Type() {
		case unixfs.TFile, unixfs.TRaw:
			size = fsNode.FileSize()

		case unixfs.TDirectory, unixfs.THAMTShard:
			// Don't allow reading directories.
			return nil, ErrIsDir

		case unixfs.TMetadata:
			links := node.Links()
			if len(links) == 0 {
				return nil, errors.New("incorrectly formatted metadata object")
			}
			// Metadata nodes wrap the actual file node: fetch the wrapped
			// child and recurse into it.
			child, err := links[0].GetNode(ctx, serv)
			if err != nil {
				return nil, err
			}
			childpb, ok := child.(*mdag.ProtoNode)
			if !ok {
				return nil, mdag.ErrNotProtobuf
			}
			return NewDagReader(ctx, childpb, serv)

		case unixfs.TSymlink:
			return nil, ErrCantReadSymlinks

		default:
			return nil, unixfs.ErrUnrecognizedType
		}

	default:
		return nil, ErrUnkownNodeType
	}

	// The derived context lets Close cancel any in-flight node fetches
	// issued through the walker.
	ctxWithCancel, cancel := context.WithCancel(ctx)

	return &dagReader{
		ctx:       ctxWithCancel,
		cancel:    cancel,
		serv:      serv,
		size:      size,
		rootNode:  n,
		dagWalker: ipld.NewWalker(ctxWithCancel, ipld.NewNavigableIPLDNode(n, serv)),
	}, nil
}
// dagReader provides a way to easily read the data contained in a dag.
type dagReader struct {
	// Structure to perform the DAG iteration and search, the reader
	// just needs to add logic to the `Visitor` callback passed to
	// `Iterate` and `Seek`.
	dagWalker *ipld.Walker

	// Buffer with the data extracted from the current node being visited.
	// To avoid revisiting a node to complete a (potential) partial read
	// (or read after seek) the node's data is fully extracted in a single
	// `saveNodeData` operation. A nil value signals the buffer has been
	// fully consumed (or never loaded).
	currentNodeData *bytes.Reader

	// Implements the `Size()` API.
	size uint64

	// Current offset for the read head within the DAG file.
	offset int64

	// Root node of the DAG, stored to re-create the `dagWalker` (effectively
	// re-setting the position of the reader, used during `Seek`).
	rootNode ipld.Node

	// Context passed to the `dagWalker`, the `cancel` function is used to
	// cancel read operations (cancelling requested child node promises,
	// see `ipld.NavigableIPLDNode.FetchChild` for details).
	ctx    context.Context
	cancel func()

	// Passed to the `dagWalker` that will use it to request nodes.
	// TODO: Revisit name.
	serv ipld.NodeGetter
}
// Size returns the total size in bytes of the DAG structured file,
// computed once at reader creation.
func (dr *dagReader) Size() uint64 {
	return dr.size
}
// Read implements `io.Reader` by delegating to `CtxReadFull` with the
// reader's internal context (created at construction time and cancelled
// by `Close`).
func (dr *dagReader) Read(b []byte) (int, error) {
	return dr.CtxReadFull(dr.ctx, b)
}
// CtxReadFull reads data from the DAG structured file. It always
// attempts a full read of the DAG until the `out` buffer is full.
// It uses the `Walker` structure to iterate the file DAG and read
// every node's data into the `out` buffer.
//
// Returns the number of bytes copied into `out`, and `io.EOF` once the
// entire DAG has been traversed.
func (dr *dagReader) CtxReadFull(ctx context.Context, out []byte) (n int, err error) {
	// Set the `dagWalker`'s context to the `ctx` argument, it will be used
	// to fetch the child node promises (see
	// `ipld.NavigableIPLDNode.FetchChild` for details).
	dr.dagWalker.SetContext(ctx)
	// If there was a partially read buffer from the last visited
	// node read it before visiting a new one.
	if dr.currentNodeData != nil {
		// TODO: Move this check inside `readNodeDataBuffer`?
		n = dr.readNodeDataBuffer(out)
		if n == len(out) {
			// Output buffer full, no need to traverse the DAG.
			return n, nil
		}
	}
	// Iterate the DAG calling the passed `Visitor` function on every node
	// to read its data into the `out` buffer, stop if there is an error or
	// if the entire DAG is traversed (`EndOfDag`).
	err = dr.dagWalker.Iterate(func(visitedNode ipld.NavigableNode) error {
		node := ipld.ExtractIPLDNode(visitedNode)
		// Skip internal nodes, they shouldn't have any file data
		// (see the `balanced` package for more details).
		if len(node.Links()) > 0 {
			return nil
		}
		// Save the leaf node file data in a buffer in case it is only
		// partially read now and future `CtxReadFull` calls reclaim the
		// rest (as each node is visited only once during `Iterate`).
		err = dr.saveNodeData(node)
		if err != nil {
			return err
		}
		// TODO: We could check if the entire node's data can fit in the
		// remaining `out` buffer free space to skip this intermediary step.
		n += dr.readNodeDataBuffer(out[n:])
		if n == len(out) {
			// Output buffer full, no need to keep traversing the DAG,
			// signal the `Walker` to pause the iteration.
			dr.dagWalker.Pause()
		}
		return nil
	})
	if err == ipld.EndOfDag {
		// Reached the end of the (DAG) file, no more data to read.
		return n, io.EOF
	} else if err != nil {
		// Pass along any other errors from the `Visitor`.
		return n, err
	}
	return n, nil
}
// saveNodeData extracts the UnixFS file data from `node` and loads it into
// the internal `currentNodeData` buffer, from which later reads consume it
// (`Read`) or position within it (`Seek`).
func (dr *dagReader) saveNodeData(node ipld.Node) error {
	data, err := unixfs.ReadUnixFSNodeData(node)
	if err != nil {
		return err
	}

	dr.currentNodeData = bytes.NewReader(data)
	return nil
}
// readNodeDataBuffer copies data from the `currentNodeData` buffer into
// `out` and returns the number of bytes copied. It cannot fail: it reads
// from an in-memory `bytes.Reader` and only takes what is available.
func (dr *dagReader) readNodeDataBuffer(out []byte) int {
	// The error is deliberately discarded: `bytes.Reader.Read` may not
	// report EOF on the call that drains the buffer, so the end condition
	// is detected through `Len` below instead.
	n, _ := dr.currentNodeData.Read(out)

	if dr.currentNodeData.Len() == 0 {
		// Buffer fully consumed: drop it so later `Read` calls know to
		// move on to the next node. No EOF is signaled here, this is only
		// the end of one node's data, not of the entire DAG.
		dr.currentNodeData = nil
	}

	// Keep the global read head in sync with what was extracted.
	// TODO: Should `offset` be incremented here or in the calling function?
	// (Doing it here saves LoC but may be confusing as it's more hidden).
	dr.offset += int64(n)

	return n
}
// writeNodeDataBuffer is the `io.Writer` counterpart of
// `readNodeDataBuffer`: it drains the `currentNodeData` buffer into `w`
// instead of a byte slice, returning the number of bytes written.
//
// TODO: Check what part of the logic between the two functions
// can be extracted away.
func (dr *dagReader) writeNodeDataBuffer(w io.Writer) (int64, error) {
	written, err := dr.currentNodeData.WriteTo(w)
	if err != nil {
		return written, err
	}

	if dr.currentNodeData.Len() == 0 {
		// Buffer fully drained: discard it so later calls know it was
		// consumed. This isn't an EOF condition, it's only the end of a
		// single node's data, not of the entire DAG.
		dr.currentNodeData = nil
	}

	// Keep the global read head in sync with what was written out.
	dr.offset += written
	return written, nil
}
// WriteTo writes to the given writer.
// This follows the `bytes.Reader.WriteTo` implementation
// where it starts from the internal index that may have
// been modified by other `Read` calls.
//
// TODO: This implementation is very similar to `CtxReadFull`,
// the common parts should be abstracted away.
func (dr *dagReader) WriteTo(w io.Writer) (n int64, err error) {
	// Use the internal reader's context to fetch the child node promises
	// (see `ipld.NavigableIPLDNode.FetchChild` for details).
	dr.dagWalker.SetContext(dr.ctx)
	// If there was a partially read buffer from the last visited
	// node write it out before visiting a new one.
	if dr.currentNodeData != nil {
		n, err = dr.writeNodeDataBuffer(w)
		if err != nil {
			return n, err
		}
	}
	// Iterate the DAG calling the passed `Visitor` function on every node
	// to write its data to `w`, stop if there is an error or if the
	// entire DAG is traversed (`EndOfDag`).
	err = dr.dagWalker.Iterate(func(visitedNode ipld.NavigableNode) error {
		node := ipld.ExtractIPLDNode(visitedNode)
		// Skip internal nodes, they shouldn't have any file data
		// (see the `balanced` package for more details).
		if len(node.Links()) > 0 {
			return nil
		}
		// Save the leaf node file data in a buffer in case the write is
		// interrupted and a future call reclaims the rest (as each node
		// is visited only once during `Iterate`).
		err = dr.saveNodeData(node)
		if err != nil {
			return err
		}
		written, err := dr.writeNodeDataBuffer(w)
		n += written
		if err != nil {
			return err
		}
		return nil
	})
	if err == ipld.EndOfDag {
		// Reaching the end of the DAG is the normal termination of a full
		// `WriteTo`, not an error.
		return n, nil
	}
	return n, err
}
// Close cancels the reader's internal context, aborting any in-flight
// node fetches started through `Read`. Fetches started via `CtxReadFull`
// with a caller-supplied context are unaffected.
func (dr *dagReader) Close() error {
	dr.cancel()
	return nil
}
// Seek implements `io.Seeker` seeking to a given offset in the DAG file,
// it matches the standard unix `seek`. It moves the position of the internal
// `dagWalker` and may also leave a `currentNodeData` buffer loaded in case
// the seek is performed to the middle of the data in a node.
//
// TODO: Support seeking from the current position (relative seek)
// through the `dagWalker` in `io.SeekCurrent`.
func (dr *dagReader) Seek(offset int64, whence int) (int64, error) {
	switch whence {
	case io.SeekStart:
		if offset < 0 {
			return -1, errors.New("invalid offset")
		}
		if offset == dr.offset {
			// Already at the requested `offset`, nothing to do.
			return offset, nil
		}
		// Amount left to seek.
		left := offset
		// Seek from the beginning of the DAG: discard any loaded node data
		// and restart the walker at the root.
		dr.resetPosition()
		// Use the internal reader's context to fetch the child node promises
		// (see `ipld.NavigableIPLDNode.FetchChild` for details).
		dr.dagWalker.SetContext(dr.ctx)
		// TODO: Performance: we could adjust here `preloadSize` of
		// `ipld.NavigableIPLDNode` also, when seeking we only want
		// to fetch one child at a time.

		// Seek the DAG by calling the provided `Visitor` function on every
		// node the `dagWalker` descends to while searching which can be
		// either an internal or leaf node. In the internal node case, check
		// the child node sizes and set the corresponding child index to go
		// down to next. In the leaf case (last visit of the search), if there
		// is still an amount `left` to seek do it inside the node's data
		// saved in the `currentNodeData` buffer, leaving it ready for a `Read`
		// call.
		err := dr.dagWalker.Seek(func(visitedNode ipld.NavigableNode) error {
			node := ipld.ExtractIPLDNode(visitedNode)
			if len(node.Links()) > 0 {
				// Internal node, should be a `mdag.ProtoNode` containing a
				// `unixfs.FSNode` (see the `balanced` package for more details).
				fsNode, err := unixfs.ExtractFSNode(node)
				if err != nil {
					return err
				}
				// If there aren't enough size hints don't seek
				// (see the `io.EOF` handling error comment below).
				if fsNode.NumChildren() != len(node.Links()) {
					return io.EOF
				}
				// Internal nodes have no data, so just iterate through the
				// sizes of its children (advancing the child index of the
				// `dagWalker`) to find where we need to go down to next in
				// the search.
				for {
					childSize := fsNode.BlockSize(int(dr.dagWalker.ActiveChildIndex()))
					if childSize > uint64(left) {
						// This child's data contains the position requested
						// in `offset`, go down this child.
						return nil
					}
					// Else, skip this child.
					left -= int64(childSize)
					err := dr.dagWalker.NextChild()
					if err == ipld.ErrNextNoChild {
						// No more child nodes available, nothing to do,
						// the `Seek` will stop on its own.
						return nil
					} else if err != nil {
						// Pass along any other errors (that may in future
						// implementations be returned by `Next`) to stop
						// the search.
						return err
					}
				}
			} else {
				// Leaf node, seek inside its data.
				err := dr.saveNodeData(node)
				if err != nil {
					return err
				}
				_, err = dr.currentNodeData.Seek(left, io.SeekStart)
				if err != nil {
					return err
				}
				// The corner case of a DAG consisting only of a single (leaf)
				// node should make no difference here. In that case, where the
				// node doesn't have a parent UnixFS node with size hints, this
				// implementation would allow this `Seek` to be called with an
				// argument larger than the buffer size which normally wouldn't
				// happen (because we would skip the node based on the size
				// hint) but that would just mean that a future `CtxReadFull`
				// call would read no data from the `currentNodeData` buffer.
				// TODO: Re-check this reasoning.

				// In the leaf node case the search will stop here.
				return nil
			}
		})
		if err == io.EOF {
			// TODO: Taken from https://github.com/ipfs/go-ipfs/pull/4320,
			// check if still valid.
			// Return negative number if we can't figure out the file size. Using io.EOF
			// for this seems to be good(-enough) solution as it's only returned by
			// precalcNextBuf when we step out of file range.
			// This is needed for gateway to function properly
			return -1, nil
		}
		if err != nil {
			return 0, err
		}
		dr.offset = offset
		return dr.offset, nil
	case io.SeekCurrent:
		if offset == 0 {
			return dr.offset, nil
		}
		// TODO: Performance. This can be improved supporting relative
		// searches in the `Walker` (see `Walker.Seek`).
		return dr.Seek(dr.offset+offset, io.SeekStart)
	case io.SeekEnd:
		return dr.Seek(int64(dr.Size())+offset, io.SeekStart)
	default:
		return 0, errors.New("invalid whence")
	}
}
// resetPosition rewinds the reader to the start of the DAG by re-creating
// the `dagWalker` at the root node and discarding any partially consumed
// node data held in `currentNodeData`. Used by the `SeekStart` case.
//
// TODO: Storing `dr.rootNode` and `dr.serv` just for this call could be
// avoided if `Reset` were supported by the `Walker`.
func (dr *dagReader) resetPosition() {
	dr.dagWalker = ipld.NewWalker(dr.ctx, ipld.NewNavigableIPLDNode(dr.rootNode, dr.serv))
	dr.currentNodeData = nil
}