/*
 * Copyright 1998-2001, University of Notre Dame.
 * Authors: Jeffrey M. Squyres, Arun Rodrigues, and Brian Barrett with
 *          Kinis L. Meyer, M. D. McNally, and Andrew Lumsdaine
 * 
 * This file is part of the Notre Dame LAM implementation of MPI.
 * 
 * You should have received a copy of the License Agreement for the Notre
 * Dame LAM implementation of MPI along with the software; see the file
 * LICENSE.  If not, contact Office of Research, University of Notre
 * Dame, Notre Dame, IN 46556.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted subject to the conditions specified in the
 * LICENSE file.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * 
 * Additional copyrights may follow.
 * 
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN
 *
 *	$Id: rpi_tcp.c,v 1.5.2.1 2001/09/30 20:17:05 bbarrett Exp $
 *
 *	Function:	- TCP client-to-client interface
 */

#include <lam_config.h>

#if !LAM_RPI_TCP

/* If we're not compiling the TCP RPI (e.g., it's one of the others
   that is only linking in this RPI so that it can be used for
   off-node communication), then effectively ignore this module. */

/* Only definition emitted when LAM_RPI_TCP is false; presumably a
   placeholder so this translation unit is not empty -- TODO confirm. */
int lam_rpi_tcp_c_bogus_module = 1;

#else

/* Otherwise, compile in the whole module. */

#include <sfh.h>

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#if HAVE_STRINGS_H
#include <strings.h>
#endif
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>				/* LINUX FD_SET etc. */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#if HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
#endif
#include <sys/uio.h>
#if LAM_HAVE_FIONBIO
#include <sys/ioctl.h>
#endif

#if LAM_NEED_SYS_SELECT_H
#include <sys/select.h>
#endif

#include <app_mgmt.h>
#include <blktype.h>
#include <dl_inet.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>
#include <t_types.h>
#if LAM_WANT_IMPI
#include <impi.h>
#endif
#include <args.h>

#include <rpi_tcp.h>
#include <tcp_low.h>
#include <rpi_c2c.h>

/*
 * external functions
 *
 * Declared without parameter lists (old-style declarations), so the
 * compiler does not check the arguments passed at the call sites.
 */
/* called in connect_all() with the address of the port variable;
   a negative return is treated as failure */
extern int		sfh_sock_open_srv_inet_stm();
/* called in connect_all() with the remote address bytes and port;
   a negative return is treated as failure */
extern int		sfh_sock_open_clt_inet_stm();
/* called in connect_all() with the server socket and -1;
   a negative return is treated as failure */
extern int		sfh_sock_accept_tmout();
/* called in connect_all() to obtain the link array and its length;
   a nonzero return is treated as failure */
extern int		ldogetlinks();
/* not referenced in the visible part of this file */
extern void		lam_commfault();

/*
 * private functions
 *
 * Prototypes for the file-local helpers defined further down.
 */
/* register a process socket for reading on behalf of a request */
static void		add_read(struct c2c_proc *ps,
				 MPI_Request req);
/* register a process socket for writing on behalf of a request */
static void		add_write(struct c2c_proc *ps,
				  MPI_Request req);
/* register every process of the request's peer group for reading */
static void		add_read_any_src(MPI_Request req);
/* handle a send whose destination is the calling process */
static int		send_to_self(MPI_Request req_top,
				     MPI_Request send);
/* complete a self-send against one matching receive or probe */
static int		send_to_self_match(MPI_Request send,
					   MPI_Request recv);
/* set up the TCP connections to the other processes */
static int		connect_all(void);
/* fill a network message header used during connection setup
   (the third parameter is spelled "head" in the definition) */
static void		fill_sync(struct _proc *src,
				  struct _proc *dest,
				  struct nmsg *hea);
/* reset the c2c-specific state of a process entry */
static void		proc_init(struct _proc *p);
/* shut down and close the socket of one process */
static int		finalize1(struct _proc *p);

/*
 * global variables
 *
 * _c2c_flblock: set from the fl_block argument at the start of
 *               _rpi_c2c_advance() and cleared there once a blocking
 *               request is found in the done state.
 * _c2c_haveadv: reset to 0 at the start of _rpi_c2c_advance(), set to 1
 *               by _c2c_comm_dead(), and returned by _rpi_c2c_advance().
 */
int                   _c2c_flblock;           /* blocking flag */
int                   _c2c_haveadv;           /* have advanced? */


/*
 *	_rpi_c2c_init
 *
 *	Function:	- primary initialization of the RPI subsystem
 *			- sets up unexpected-message buffering, then the
 *			  connections to the other processes
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_init()

{
	int		failed;		/* nonzero if a setup step failed */
/*
 * Buffering is initialized first; the connection step is skipped
 * (short-circuit) when buffering setup reports an error.
 */
	failed = _cbuf_init() || _rpi_c2c_addprocs();

	return(failed ? LAMERROR : 0);
}

/*
 *	_rpi_c2c_addprocs
 *
 *	Function:	- set up newly added processes
 *			- delegates to connect_all(), which skips the
 *			  processes that are already initialized
 *	Returns:	- 0 or LAMERROR (the result of connect_all())
 */
int
_rpi_c2c_addprocs()

{
	int		status;		/* result of connection setup */

	status = connect_all();

	return(status);
}

/*
 *	_rpi_c2c_build
 *
 *	Function:	- one-time RPI setup of a request
 *			- points the request's envelope buffer at its own
 *			  envelope; per-start work is in _rpi_c2c_start()
 *	Accepts:	- request
 *	Returns:	- 0 (always)
 */
int
_rpi_c2c_build(req)

MPI_Request		req;

{
	struct c2c_envl	*envelope;	/* the request's own envelope */

	envelope = &req->rq_rpi.c2c.cq_env;
	req->rq_rpi.c2c.cq_envbuf = (char *) envelope;

	return(0);
}

/*
 *	_rpi_c2c_start
 *
 *	Function:	- initializes RPI dependent aspects of a request
 *			- cost per request start - separated from
 *			  _rpi_c2c_build() to optimize persistent requests
 *	Accepts:	- request list
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_start(req_top, req)

MPI_Request		req_top;
MPI_Request		req;

{
    struct cbuf_msg	*bmsg;			/* buffered message */

    if (commdead_m(req)) return(0);
/*
 * Set common synchronization and communication parameters. The peer in
 * a receive request may be a wildcard but will be set to the actual
 * peer upon matching with an incoming mesage.
 */
    req->rq_rpi.c2c.cq_peer = req->rq_rank;
    req->rq_rpi.c2c.cq_env.ce_flags = 0;
    req->rq_rpi.c2c.cq_env.ce_tag = req->rq_tag;
    req->rq_rpi.c2c.cq_env.ce_cid = req->rq_cid;
/*
 * receive specific initialization
 */
    if ((req->rq_type == LAM_RQIRECV) || (req->rq_type == LAM_RQIPROBE)) {
	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
/*
 * Check for matching buffered envelope/message. If one is found fill
 * in any receive request wildcards and advance the request.  
 */
	if ((bmsg = _cbuf_find(&req->rq_rpi.c2c.cq_env))) {
	    _c2c_fill_wildcards(req, &bmsg->cm_env);
	    return(_tcp_buffered_adv(req, bmsg));
	}
/*
 * No match was found. Set the request protocol transition function.
 */
	req->rq_rpi.c2c.cq_adv =
	    (req->rq_type == LAM_RQIRECV) ? _tcp_req_recv : _tcp_req_probe;
    }
/*
 * send specific initialization
 */
    else {
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_env.ce_seq = req->rq_seq;
	req->rq_rpi.c2c.cq_env.ce_len = req->rq_packsize;

	if (req->rq_proc == lam_myproc) {
/*
 * send to self
 */
	    return(send_to_self(req_top, req));
	}
	else {
/*
 * send to another process
 */
	    req->rq_rpi.c2c.cq_state = C2CWRITE;
	    req->rq_rpi.c2c.cq_msgbuf = req->rq_packbuf;

	    if (req->rq_packsize > LAM_TCPSHORTMSGLEN) {
/*
 * long message protocol
 */
		req->rq_rpi.c2c.cq_env.ce_flags |= C2CLONG;
		req->rq_rpi.c2c.cq_adv = _tcp_req_send_long;

	    } else {
/*
 * short message protocol
 */
		req->rq_rpi.c2c.cq_nmsgout = req->rq_packsize;

		if (req->rq_type == LAM_RQISSEND) {
		    req->rq_rpi.c2c.cq_env.ce_flags |= C2CSSEND;
		    req->rq_rpi.c2c.cq_adv = _tcp_req_send_synch;
		} else {
		    req->rq_rpi.c2c.cq_adv = _tcp_req_send_short;
		}
	    }
/*
 * prepare for writing of envelope
 */
	    tcp_set_out_envelope_m(req->rq_rpi.c2c);
	}
    }

    return(0);
}

/*
 *	_rpi_c2c_destroy
 *
 *	Function:	- destroys the RPI portion of a request
 *			- nothing is released here; the function only
 *			  reports success
 *	Accepts:	- request (unused)
 *	Returns:	- 0 (always)
 */
int
_rpi_c2c_destroy(req)

MPI_Request		req;

{
    (void) req;

    return(0);
}

/*
 *	_rpi_c2c_advance
 *
 *	Function:	- advances requests in c2c mode
 *			- collects the sockets that need IO for the active
 *			  requests, then drives the IO until something has
 *			  advanced (blocking) or once (non-blocking)
 *	Accepts:	- request list
 *			- block enable flag
 *	Returns:	- 1: state change, 0: no state change, LAMERROR: error
 */
int
_rpi_c2c_advance(req_top, fl_block)

MPI_Request		req_top;
int			fl_block;

{
	MPI_Request		cur;		/* request being examined */
	int			failed;		/* result of one IO round */
/*
 * Reset the per-call IO bookkeeping.
 */
	_tcp_nio = 0;
	_tcp_sockmax = -1;
	_c2c_haveadv = 0;
	_c2c_flblock = fl_block;
	FD_ZERO(&_tcp_read);
	FD_ZERO(&_tcp_write);
	FD_ZERO(&_tcp_except);
	FD_ZERO(&_tcp_eoferr);
/*
 * Work out which requests require IO.
 */
	for (cur = req_top; cur != 0; cur = cur->rq_next) {
/*
 * Requests still in the init state are started first.
 */
		if ((cur->rq_state == LAM_RQSINIT)
				&& (_mpi_req_start(cur) != MPI_SUCCESS)) {
			return(LAMERROR);
		}
/*
 * Once a blocking request is done we may no longer block.
 */
		if (cur->rq_state == LAM_RQSDONE) {
			if (cur->rq_flags & LAM_RQFBLOCK) {
				_c2c_flblock = 0;
			}
			continue;
		}

		if (commdead_m(cur)) {
			continue;
		}

		if (cur->rq_rpi.c2c.cq_state == C2CREAD) {
/*
 * A request without a process reads from any source.
 */
			if (cur->rq_proc != 0) {
				add_read(&cur->rq_proc->p_rpi.c2c, cur);
			} else {
				add_read_any_src(cur);
			}
		}
		else if (cur->rq_rpi.c2c.cq_state == C2CWRITE) {
			add_write(&cur->rq_proc->p_rpi.c2c, cur);
		}
	}
/*
 * No IO to do: with GER enabled a blocking call cannot make progress.
 */
	if (_tcp_nio < 1) {
		if (lam_ger && _c2c_flblock) {
			errno = EGERFLOW;
			return(LAMERROR);
		}

		return(_c2c_haveadv);
	}
/*
 * Drive the IO; repeat while blocking and nothing has advanced yet.
 */
	do {
		failed = (_tcp_nio == 1) ? _tcp_adv1() : _tcp_advmultiple();
		if (failed) {
			return(LAMERROR);
		}
	} while (_c2c_flblock && !_c2c_haveadv);

	return(_c2c_haveadv);
}

/*
 *      _rpi_c2c_iprobe
 *
 *      Function:       - non-blocking probe
 *                      - public interface for peculiar MPI_Iprobe() which
 *                        does not return a request to the user
 *                      - the request is linked into the request list only
 *                        for the duration of this call; it is unlinked on
 *                        the error path as well
 *      Accepts:        - request
 *      Returns:        - 0: no msg, 1: msg, LAMERROR: error
 */
int
_rpi_c2c_iprobe(req)

MPI_Request             req;

{
	int			err;		/* error code */
	int			save_errno;	/* errno of failed advance */
/*
 * Link the probe request and advance as far as possible.
 */
	_mpi_req_add(req);
	_mpi_req_blkclr();
	err = _mpi_req_advance();

	if (err != MPI_SUCCESS) {
/*
 * Unlink before reporting the failure so that the request does not
 * stay in the list after we return; keep the errno of the failed
 * advance for the caller.
 */
		save_errno = errno;
		_mpi_req_rem(req);
		errno = save_errno;
		return(LAMERROR);
	}
/*
 * Unlink the request.
 */
	_mpi_req_rem(req);
/*
 * A message was found if the request is in the done state.
 */
	return((req->rq_state == LAM_RQSDONE) ? 1 : 0);
}

/*
 *	_rpi_c2c_finalize
 *
 *	Function:	- c2c cleanup
 *			- for a given process only its connection is closed;
 *			  for 0 the buffers are released and every process
 *			  in the process list is cleaned up
 *	Accepts:	- process to cleanup (0 => all processes)
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_finalize(p)

struct _proc		*p;

{
	struct _proc	*cur;			/* process list cursor */
/*
 * Single process: just close its connection.
 */
	if (p != 0) {
		return(finalize1(p));
	}
/*
 * All processes: clean up the buffers, then walk the process list
 * closing connections and stop at the first failure.
 */
	_cbuf_end();

	cur = lam_topproc();
	while (cur != 0) {
		if (finalize1(cur)) {
			return(LAMERROR);
		}
		cur = lam_nextproc();
	}

	return(0);
}

/*
 *	send_to_self
 *
 *	Function:	- advance send to self
 *			- completes against an already started matching
 *			  receive if there is one, otherwise buffers the
 *			  message
 *	Accepts:	- request list
 *			- send request
 *	Returns:	- 0 or LAMERROR
 */
static int
send_to_self(req_top, send)

MPI_Request		req_top;
MPI_Request		send;

{
    MPI_Request     	recv;			/* receive request */
    struct cbuf_msg 	msg;			/* buffer list entry */
    int			save_errno;		/* errno of failed append */
/*
 * Look for inactive matching receive/probe and advance if found.
 * A matching probe does not consume the message (the helper returns 0
 * for it), so the search continues past probes.
 */
    for (recv = req_top; recv; recv = recv->rq_next) {

	if ((recv->rq_state == LAM_RQSSTART)
		&& (recv->rq_rpi.c2c.cq_state == C2CREAD)
		&& (!_c2c_envl_cmp(
		    &send->rq_rpi.c2c.cq_env, &recv->rq_rpi.c2c.cq_env))) {

	    if (send_to_self_match(send, recv)) {
		return(0);
	    }
	}
    }
/*
 * No matching receive found, buffer the whole message and the send is
 * done unless it's a synchronous send in which case we use the user
 * buffer and the send only completes once a matching receive is posted.
 */
    msg.cm_env = send->rq_rpi.c2c.cq_env;
    msg.cm_proc = 0;

    if (send->rq_type == LAM_RQISSEND) {
	send->rq_rpi.c2c.cq_state = C2CSENDSELF;
	msg.cm_buf = send->rq_packbuf;
	msg.cm_req = send;
    }
    else {
	if (send->rq_packsize > 0) {
	    if ((msg.cm_buf = (char *) malloc(send->rq_packsize)) == 0) {
		return(LAMERROR);
	    }
	    memcpy(msg.cm_buf, send->rq_packbuf, send->rq_packsize);
	} else {
	    msg.cm_buf = 0;
	}

	msg.cm_req = 0;
   	send->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
    }

    if (!_cbuf_append(&msg)) {
/*
 * The entry was not buffered.  In the non-synchronous case cm_buf is
 * our private copy (or 0) and nobody else will release it; in the
 * synchronous case it is the sender's own buffer and must be left
 * alone.  NOTE(review): assumes a failed _cbuf_append() keeps no
 * reference to cm_buf -- confirm.
 */
	if (msg.cm_req == 0) {
	    save_errno = errno;
	    free(msg.cm_buf);
	    errno = save_errno;
	}
	return(LAMERROR);
    }

    return(0);
}

/*
 *	send_to_self_match
 *
 *	Function:	- advance send and matching receive/probe
 *			- a probe only gets its status filled in; a receive
 *			  gets the data copied and both requests complete
 *	Accepts:	- send request
 *			- receive/probe request
 *	Returns:	- 1: matched a receive, 0: matched a probe
 */
static int
send_to_self_match(send, recv)

MPI_Request		send;
MPI_Request		recv;

{
    int			len;			/* # bytes to transfer */

    recv->rq_seq = send->rq_seq;
    if (recv->rq_type == LAM_RQIPROBE) {
/*
 * The receive is actually a probe so the send is not complete.
 * The probe reports the full length of the pending message.
 */
	_c2c_fill_mpi_status(recv, send->rq_rpi.c2c.cq_env.ce_rank,
		send->rq_rpi.c2c.cq_env.ce_tag, send->rq_rpi.c2c.cq_env.ce_len);
		
	recv->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
	return(0);
    }
    else {
/*
 * It's really a receive. Do the data transfer.
 *
 * Check for mismatched message lengths: a message longer than the
 * receive buffer is truncated and the request is flagged.
 */
	if (send->rq_packsize > recv->rq_packsize) {
	    recv->rq_flags |= LAM_RQFTRUNC;
	    len = recv->rq_packsize;
	} else {
	    len = send->rq_packsize;
	}
/*
 * Skip the copy for empty transfers: the buffers of a zero-length
 * message may be null, and memcpy() must not be handed null pointers
 * even when the length is zero.
 */
	if (len > 0) {
	    memcpy(recv->rq_packbuf, send->rq_packbuf, len);
	}

	_c2c_fill_mpi_status(recv, send->rq_rpi.c2c.cq_env.ce_rank,
				send->rq_rpi.c2c.cq_env.ce_tag, len);

	send->rq_state = recv->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
	return(1);
    }
}

/*
 *	add_write
 *
 *	Function:	- add process to write advance list
 *			- a process whose socket is already in the write
 *			  set is left untouched
 *	Accepts:	- process
 *			- writing request
 *
 *	NOTE(review): unlike add_read() there is no check for a negative
 *	socket here; callers appear to pass connected processes only --
 *	confirm.
 */
static void
add_write(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sock = ps->cp_sock;	/* socket of the process */
/*
 * Nothing to do if the socket is already registered for writing.
 */
	if (FD_ISSET(sock, &_tcp_write)) {
		return;
	}
/*
 * The process keeps the write request it already has, if any.
 */
	if (ps->cp_wreq == 0) {
		ps->cp_wreq = req;
	}

	FD_SET(sock, &_tcp_write);
	FD_SET(sock, &_tcp_except);

	if (_tcp_sockmax < sock) {
		_tcp_sockmax = sock;
	}

	_tcp_lastreq = req;
	_tcp_nio++;
}

/*
 *	add_read
 *
 *	Function:	- add process to read advance list
 *			- processes without a socket (negative descriptor,
 *			  e.g. the process itself) are not added
 *	Accepts:	- process
 *			- request to start matching from
 */
static void
add_read(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sock;			/* socket of the process */
/*
 * With GER enabled, do not read from a process that has reached the
 * MPI_GER limit in cp_nbfde.
 */
	if (lam_ger && ps->cp_nbfde >= MPI_GER) {
		return;
	}
/*
 * Skip processes without a socket and sockets already registered.
 */
	sock = ps->cp_sock;
	if (sock < 0) {
		return;
	}

	if (FD_ISSET(sock, &_tcp_read)) {
		return;
	}

	FD_SET(sock, &_tcp_read);
	FD_SET(sock, &_tcp_except);
	ps->cp_mreq = req;

	if (_tcp_sockmax < sock) {
		_tcp_sockmax = sock;
	}

	_tcp_lastreq = req;
	_tcp_nio++;
}

/*
 *	add_read_any_src
 *
 *	Function:	- add to the read advance list all processes in
 *			  the peer group of a receive request on MPI_ANY_SOURCE
 *			- the peer group is the remote group for an
 *			  inter-communicator, the local group otherwise
 *	Accepts:	- request
 */
static void
add_read_any_src(req)

MPI_Request		req;

{
	struct _group	*peers;			/* peer group */
	struct _proc	*peer;			/* current group member */
	int		idx;

	if (LAM_IS_INTER(req->rq_comm)) {
		peers = req->rq_comm->c_rgroup;
	} else {
		peers = req->rq_comm->c_group;
	}

	for (idx = 0; idx < peers->g_nprocs; idx++) {
		peer = peers->g_procs[idx];
#if LAM_WANT_IMPI
/*
 * Special case for IMPI -- if we're receiving from the impid proxy
 * entry in the group, replace it with the One True IMPID Proc.
 */
		if (LAM_GPSCMP(&peer->p_gps, &gimpid) == 0
				&& lam_impid_proc != 0) {
			peer = lam_impid_proc;
		}
#endif
		add_read(&peer->p_rpi.c2c, req);
	}
}

/*
 *	_c2c_fill_wildcards
 *
 *	Function:	- replace wildcards in request with matched values
 *			  and fill in the sequence number
 *			- for a wildcard source the request's process is
 *			  also resolved from the peer group
 *	Accepts:	- request
 *			- matched envelope
 */
void
_c2c_fill_wildcards(req, env)

MPI_Request		req;
struct c2c_envl		*env;

{
	struct c2c_envl	*mine;			/* the request's envelope */
	struct _group	*peers;			/* peer group */

	mine = &req->rq_rpi.c2c.cq_env;
	req->rq_seq = env->ce_seq;

	if (mine->ce_tag == MPI_ANY_TAG) {
		mine->ce_tag = env->ce_tag;
	}
/*
 * A request with a specific source needs nothing more.
 */
	if (req->rq_rpi.c2c.cq_peer != MPI_ANY_SOURCE) {
		return;
	}

	req->rq_rpi.c2c.cq_peer = env->ce_rank;
	mine->ce_rank = env->ce_rank;
/*
 * Look the actual peer up in the remote group for an
 * inter-communicator and in the local group otherwise.
 */
	if (LAM_IS_INTER(req->rq_comm)) {
		peers = req->rq_comm->c_rgroup;
	} else {
		peers = req->rq_comm->c_group;
	}

	req->rq_proc = peers->g_procs[req->rq_rpi.c2c.cq_peer];
}

/*
 *	_c2c_fill_mpi_status
 *
 *	Function:	- fill in the MPI status object
 *	Accepts:	- request
 *			- rank
 *			- tag
 *			- message length
 */
void
_c2c_fill_mpi_status(req, rank, tag, length)

MPI_Request		req;
int			rank;
int			tag;
int			length;

{
	req->rq_status.MPI_SOURCE = rank;
	req->rq_status.MPI_TAG = tag;
	req->rq_status.st_length = length;
}

/*
 *	_c2c_envl_cmp
 *
 *	Function:	- check if envelopes match
 *			- only the second envelope may carry wildcards
 *			  (MPI_ANY_SOURCE, MPI_ANY_TAG)
 *	Accepts:	- ptr to envelope
 *			- ptr to request envelope
 *	Returns:	- 0 if match, 1 if not
 */
int
_c2c_envl_cmp(pe, pq)

struct c2c_envl		*pe, *pq;

{
/*
 * The context ids must be identical.
 */
    if (pe->ce_cid != pq->ce_cid) {
	return(1);
    }
/*
 * Rank and tag must be equal unless the request asks for any.
 */
    if ((pq->ce_rank != MPI_ANY_SOURCE) && (pe->ce_rank != pq->ce_rank)) {
	return(1);
    }

    if ((pq->ce_tag != MPI_ANY_TAG) && (pe->ce_tag != pq->ce_tag)) {
	return(1);
    }
/*
 * The C2CACK and C2C2ND flag bits must agree; any difference shows
 * up as a set bit in the exclusive-or of the two flag words.
 */
    if ((pe->ce_flags ^ pq->ce_flags) & (C2CACK | C2C2ND)) {
	return(1);
    }

    return(0);
}

/*
 *      _c2c_comm_dead
 *
 *      Function:       - sets dead communicator error for request
 *                      - marks the request done and records that an
 *                        advance has happened
 *      Accepts:        - request
 *      Returns:        - 1
 */
int
_c2c_comm_dead(req)

MPI_Request             req;

{
	int			errclass;	/* error class to report */
/*
 * The active count is only reduced for requests that are neither
 * done nor still in the init state.
 */
	if (!(req->rq_state == LAM_RQSDONE || req->rq_state == LAM_RQSINIT)) {
		lam_rq_nactv--;
	}

	req->rq_state = LAM_RQSDONE;
	_c2c_haveadv = 1;
/*
 * Report a local or a remote failure depending on the communicator
 * flags.
 */
	errclass = (req->rq_comm->c_flags & LAM_CLDEAD)
		? MPI_ERR_LOCALDEAD : MPI_ERR_REMOTEDEAD;
	req->rq_status.MPI_ERROR = lam_mkerr(errclass, 0);

	return(1);
}

/*
 *	connect_all
 *
 *	Function:	- make tcp connections to all other processes
 *			- processes already marked LAM_PRPIINIT are skipped
 *			- for each remaining peer this process acts as
 *			  client or server depending on the GPS comparison
 *			- the server socket and the link table are local
 *			  to this call and are released on every exit path,
 *			  including failures
 *	Returns:	- 0 or LAMERROR
 */
static int
connect_all()

{
    struct _proc	*p;
    struct _gps		*mygps;			/* my GPS */
    struct nmsg		inmsg;			/* incoming network msg hdr */
    struct nmsg		outmsg;			/* outgoing network msg hdr */
    struct dolink	*links = 0;		/* links to neighbours */
    int4		nlinks = 0;		/* number of links */
    int			sock;			/* socket descriptor */
    int			servsockd = -1;		/* server socket descriptor */
    int			servport = 0;		/* server port number */
    int			rnode;			/* remote node */
    int			rport;			/* remote port */
    int 		flag;			/* for setting socket opts */
    int			bufsize;		/* c2c socket buffer size */
    int			save;			/* errno kept across cleanup */
    unsigned char	*raddr;			/* remote host address */
    char                myrankstr[32];

    LAM_ZERO_ME(inmsg);
    LAM_ZERO_ME(outmsg);

    mygps = &lam_myproc->p_gps;
    memset(myrankstr, 0, 32);
    snprintf(myrankstr, 31, "%d", mygps->gps_grank);
    bufsize = LAM_TCPSHORTMSGLEN + sizeof(struct c2c_envl);

    if (lam_nprocs() > 1) {
/*
 * Get links to neighbours, initialize server socket, message headers, etc.
 */
	if (ldogetlinks(&links, &nlinks)) return(LAMERROR);

	servsockd = sfh_sock_open_srv_inet_stm(&servport);
	if (servsockd < 0) {
	  show_help("rpi.tcp", "open-server-socket", myrankstr, NULL);
	  goto fail;
	}

	inmsg.nh_flags = 0;
	inmsg.nh_length = 0;
	outmsg.nh_length = 0;
	outmsg.nh_flags = 0;
	outmsg.nh_data[0] = (int4) servport;
    }
/*
 * Loop through all processes, initializing the process data and
 * connecting to those not already connected to.
 */
    for (p = lam_topproc(); p; p = lam_nextproc()) {

	if (p->p_mode & LAM_PRPIINIT) {
	    continue;
	}
	proc_init(p);
/*
 * No connection to ourselves.
 */
	if (p == lam_myproc) {
	    continue;
	}

	if (LAM_GPSCMP(mygps, &p->p_gps) >= 0) {
/*
 * Act as a client: the peer tells us its server port.
 */
	    fill_sync(p, lam_myproc, &inmsg);
	    if (nrecv(&inmsg)) goto fail;

	    rport = (int) inmsg.nh_data[0];
	    rnode = p->p_gps.gps_node;
/*
 * The node id indexes the link table, whose valid entries are
 * 0 .. nlinks-1.
 */
	    if ((rnode < 0) || (rnode >= nlinks)) goto fail;

	    raddr = (unsigned char *)
		    &links[rnode].dol_addr.sin_addr.s_addr;

	    sock = sfh_sock_open_clt_inet_stm(raddr, rport);
	    if (sock < 0) {
		show_help("rpi.tcp", "open-client-socket", myrankstr, NULL);
		goto fail;
	    }
	}
	else {
/*
 * Act as a server: tell the peer our port and wait for it to connect.
 */
	    fill_sync(lam_myproc, p, &outmsg);
	    if (nsend(&outmsg)) goto fail;

	    sock = sfh_sock_accept_tmout(servsockd, -1);
	    if (sock < 0) {
		show_help("rpi.tcp", "accept-server-socket", myrankstr,
			  NULL);
		goto fail;
	    }
	}
/*
 * From here on the socket is recorded in the process entry, so a
 * later failure leaves it for finalize1() to close.
 */
	p->p_rpi.c2c.cp_sock = sock;
	_tcp_smap[sock] = &p->p_rpi.c2c;
/*
 * Set sockets in non-blocking mode and set the send and receive buffer sizes.
 */
	flag = 1;
#if LAM_HAVE_FIONBIO
	if (ioctl(sock, FIONBIO, &flag) == -1)
	  goto fail;
#else
	if (fcntl(sock, F_SETFL, O_NONBLOCK) == -1)
	  goto fail;
#endif
	FD_CLR(sock, &_tcp_block);

	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
			    (char *) &flag, sizeof(flag))) {
	    goto fail;
	}

	if (sfh_sock_set_buf_size(sock, SFH_INET, SO_SNDBUF, bufsize)) {
	    goto fail;
	}

	if (sfh_sock_set_buf_size(sock, SFH_INET, SO_RCVBUF, bufsize)) {
	    goto fail;
	}
    }
/*
 * Success: the server socket and the link table are no longer needed.
 * Both are still at their initial values (-1 and 0) when there was
 * only one process.
 */
    if (servsockd >= 0) {
	close(servsockd);
    }
    free(links);

    return(0);
/*
 * Failure: release the same local resources, keeping the errno that
 * describes the failure intact for the caller.
 */
fail:
    save = errno;
    if (servsockd >= 0) {
	close(servsockd);
    }
    free(links);
    errno = save;

    return(LAMERROR);
}

/*
 *	proc_init
 *
 *	Function:	- initialize c2c specific process data
 *	Accepts:	- process
 */
static void
proc_init(p)

struct _proc	*p;

{
	p->p_mode |= LAM_PRPIINIT;
	p->p_rpi.c2c.cp_sock = -1;
	p->p_rpi.c2c.cp_mreq = 0;
	p->p_rpi.c2c.cp_rreq = 0;
	p->p_rpi.c2c.cp_wreq = 0;
	p->p_rpi.c2c.cp_nbfde = 0;
	p->p_rpi.c2c.cp_extra = 0;
/*
 * Set up to read in an envelope.
 */
	p->p_rpi.c2c.cp_readfn = _tcp_proc_read_env;
	p->p_rpi.c2c.cp_envbuf = (char *) &p->p_rpi.c2c.cp_env;
	p->p_rpi.c2c.cp_nenvin = sizeof(struct c2c_envl);
}

/*
 *	fill_sync
 *
 *	Function:	- fill in network message sync for connecting
 *	Accepts:	- source process
 *			- destination process
 *			- network message header (filled)
 */
static void
fill_sync(src, dest, head)

struct _proc            *src;
struct _proc            *dest;
struct nmsg             *head;

{
/*
 * This uses in effect synchronization MPI_COMM_WORLD and tag 0.
 */
	_m2l_fillsys(src->p_gps.gps_node, src->p_gps.gps_idx,
		dest->p_gps.gps_node, dest->p_gps.gps_idx, 0, 0, head);
}

/*
 *	finalize1
 *
 *	Function:	- cleanup a process
 *			- shuts down and closes its socket, if it has one,
 *			  and marks the process as unconnected
 *	Accepts:	- process
 *	Returns:	- 0 (always)
 */
static int
finalize1(p)

struct _proc		*p;

{
	int		sock = p->p_rpi.c2c.cp_sock;	/* socket to close */
/*
 * Nothing to do for a process without a socket.
 */
	if (sock < 0) {
		return(0);
	}

	shutdown(sock, 2);
	close(sock);
	p->p_rpi.c2c.cp_sock = -1;

	return(0);
}


/*
 *	_rpi_c2c_fastsend
 *
 *	Function:	- fast blocking send
 *			- sends straight from the caller's buffer when no
 *			  packing is needed, otherwise packs into a
 *			  temporary buffer that is released before returning
 *	Accepts:	- buffer to send
 *			- message count
 *			- message datatype
 *			- destination process rank
 *			- message tag
 *			- message communicator
 *	Returns:	- MPI_SUCCESS or error code
 */
int
_rpi_c2c_fastsend(buf, count, dtype, dest, tag, comm)

char			*buf;
int			count;
MPI_Datatype		dtype;
int			dest;
int			tag;
MPI_Comm		comm;

{
    double		local_rep;		/* local data representation */
    double		net_rep;		/* net data representation */
    struct _proc	*destproc;
    char		*packbuf;
    int			packsize;
    int			err;
/*
 * Check common arguments.
 */
    if (count < 0) {
	return(lam_mkerr(MPI_ERR_COUNT, 0));
    }

    if (dtype == MPI_DATATYPE_NULL || (!dtype->dt_commit)) {
	return(lam_mkerr(MPI_ERR_TYPE, 0));
    }
/*
 * The destination rank refers to the remote group of an
 * inter-communicator and to the local group otherwise.
 */
    if (LAM_IS_INTER(comm)) {
	if ((dest < 0) || (dest >= comm->c_rgroup->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	destproc = comm->c_rgroup->g_procs[dest];
    }
    else {
	if ((dest < 0) || (dest >= comm->c_group->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	destproc = comm->c_group->g_procs[dest];
    }
/*
 * Handle zero length messages.
 */
    if (count == 0 || dtype->dt_size == 0) {
	packbuf = buf;
	packsize = 0;
    }
    else {
/*
 * Convert a probe value to the network representation; if it comes
 * back unchanged no data conversion is required.
 */
	local_rep = 1.1;
	ltotf8(&local_rep, &net_rep);
/*
 * If contiguous, use the caller's buffer.
 */
	packsize = count * dtype->dt_size;

	if ((dtype->dt_flags & LAM_DTNOPACK)
		&& ((dtype->dt_flags & LAM_DTNOXADJ) || count == 1)
		&& ((local_rep == net_rep) || lam_homog)) {
	    packbuf = buf;
/*
 * Check for bad buffer.
 */
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_BUFFER, 0));
	    }
	}
/*
 * Otherwise allocate a buffer.
 */
	else {
	    packbuf = malloc(packsize);
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_OTHER, errno));
	    }

	    if (lam_pack(buf, count, dtype, packbuf, packsize) < 0) {
/*
 * Build the error code first (it captures errno), then release the
 * temporary buffer so that it is not leaked.
 */
		err = lam_mkerr(MPI_ERR_INTERN, errno);
		free(packbuf);
		return(err);
	    }
	}
    }

    err = _tcp_fastsend(packbuf, packsize,
			&destproc->p_rpi.c2c, dest, tag, comm);
/*
 * Only a temporary pack buffer belongs to us.
 */
    if (packbuf != buf) {
	free(packbuf);
    }

    return(err);
}


/*
 *	_rpi_c2c_fastrecv
 *
 *	Function:	- fast blocking receive
 *			- receives straight into the caller's buffer when no
 *			  unpacking is needed, otherwise receives into a
 *			  temporary buffer that is unpacked and released
 *			  before returning
 *	Accepts:	- buffer to receive into
 *			- message count
 *			- message datatype
 *			- source process rank
 *			- message tag (inout)
 *			- message communicator
 *			- status (out)
 *			- seqnum (out)
 *	Returns:	- MPI_SUCCESS or error code
 */
int
_rpi_c2c_fastrecv(buf, count, dtype, src, tag, comm, stat, seqnum)

char			*buf;
int			count;
MPI_Datatype		dtype;
int			src;
int			*tag;
MPI_Comm		comm;
MPI_Status		*stat;
int			*seqnum;

{
    double		local_rep;		/* local data representation */
    double		net_rep;		/* net data representation */
    struct _proc	*srcproc;
    char		*packbuf;
    int			packsize;
    int			err;
/*
 * Check common arguments.
 */
    if (count < 0) {
	return(lam_mkerr(MPI_ERR_COUNT, 0));
    }

    if (dtype == MPI_DATATYPE_NULL || (!dtype->dt_commit)) {
	return(lam_mkerr(MPI_ERR_TYPE, 0));
    }
/*
 * The source rank refers to the remote group of an inter-communicator
 * and to the local group otherwise.  Negative ranks (wildcards
 * included) are rejected here.
 */
    if (LAM_IS_INTER(comm)) {
	if ((src < 0) || (src >= comm->c_rgroup->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	srcproc = comm->c_rgroup->g_procs[src];
    }
    else {
	if ((src < 0) || (src >= comm->c_group->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	srcproc = comm->c_group->g_procs[src];
    }
/*
 * Handle zero length messages.
 */
    if (count == 0 || dtype->dt_size == 0) {
	packbuf = buf;
	packsize = 0;
    }
    else {
/*
 * Convert a probe value to the network representation; if it comes
 * back unchanged no data conversion is required.
 */
	local_rep = 1.1;
	ltotf8(&local_rep, &net_rep);
/*
 * If contiguous, use the caller's buffer.
 */
	packsize = count * dtype->dt_size;

	if ((dtype->dt_flags & LAM_DTNOPACK)
		&& ((dtype->dt_flags & LAM_DTNOXADJ) || count == 1)
		&& ((local_rep == net_rep) || lam_homog)) {
	    packbuf = buf;
/*
 * Check for bad buffer.
 */
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_BUFFER, 0));
	    }
	}
/*
 * Otherwise allocate a buffer.
 */
	else {
	    packbuf = malloc(packsize);
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_OTHER, errno));
	    }
	}
    }

    err = _tcp_fastrecv(packbuf, &packsize,
			&srcproc->p_rpi.c2c, src, tag, comm, seqnum);
/*
 * The status reports the result of the receive itself; packsize has
 * been passed by address and may have been updated by the receive.
 */
    if (stat != MPI_STATUS_IGNORE) {
	stat->MPI_SOURCE = src;
	stat->MPI_TAG = *tag;
	stat->MPI_ERROR = err;
	stat->st_length = packsize;
    }
/*
 * Unpack from the temporary buffer, if one was used.  An unpack
 * failure replaces the return code (built before free() so that it
 * captures errno); the buffer is released in either case.
 */
    if (packbuf != buf) {
	if (lam_unpack(packbuf, packsize, buf, count, dtype) < 0) {
	    err = lam_mkerr(MPI_ERR_INTERN, errno);
	}

	free(packbuf);
    }

    return(err);
}

#endif /* LAM_RPI_TCP */
