Need USB CPPI DMA patch for DaVinci (2.6.23 git kernel)

paddy padmanabha at tesbv.com
Thu Sep 4 08:52:56 CDT 2008


Frank Xue wrote:
> Paddy,
>    I got this patch from a TI engineer, Swami. I applied and tested it
> on our custom board. It works OK, so you can try it. You can also
> browse the Neuros git repo. The URL is:
>    http://git.neurostechnology.com/
> Frank
>
> paddy wrote:
>> Hi Frank,
>>
>> I am working on the DaVinci platform with the 2.6.23 git kernel. Could
>> you please provide the USB CPPI DMA patch for the 2.6.23 git kernel?
>> It would help me measure USB performance with DMA enabled
>> on our custom board.
>>
>> Thanks and Regards,
>> Padmanabha.s
>>
>> _______________________________________________
>> Davinci-linux-open-source mailing list
>> Davinci-linux-open-source at linux.davincidsp.com
>> http://linux.davincidsp.com/mailman/listinfo/davinci-linux-open-source
>>
>
> ------------------------------------------------------------------------
>
> From 07e429485302b6fcbe04ade75a707c3114ecaae5 Mon Sep 17 00:00:00 2001
> From: =?utf-8?q?=E8=96=9B=E5=BE=B7=E7=AB=A0?= <frank.xue at neuros.com.cn>
> Date: Fri, 6 Jun 2008 09:54:06 +0800
> Subject: [PATCH] USB CPPI DMA patch for davinci
> MIME-Version: 1.0
> Content-Type: text/plain; charset=utf-8
> Content-Transfer-Encoding: 8bit
>
> This patch fixes a USB instability issue seen when writing
> large amounts of data to a USB device.
>
> Signed-off-by: 薛德章 <frank.xue at neuros.com.cn>
> ---
>  drivers/usb/musb/cppi_dma.c  | 1317 ++++++++++++++++++-----------------------
>  drivers/usb/musb/cppi_dma.h  |  158 +++--
>  drivers/usb/musb/musb_core.c |   31 +-
>  drivers/usb/musb/musb_core.h |   17 +
>  drivers/usb/musb/musb_host.c |   68 ++-
>  drivers/usb/musb/musb_host.h |    5 +-
>  6 files changed, 774 insertions(+), 822 deletions(-)
>
> diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c
> index 71c42db..5980fad 100644
> --- a/drivers/usb/musb/cppi_dma.c
> +++ b/drivers/usb/musb/cppi_dma.c
> @@ -3,35 +3,20 @@
>   *
>   * This file implements a DMA  interface using TI's CPPI DMA.
>   * For now it's DaVinci-only, but CPPI isn't specific to DaVinci or USB.
> - * The TUSB6020, using VLYNQ, has CPPI that looks much like DaVinci.
>   */
>  
>  #include <linux/usb.h>
>  
>  #include "musb_core.h"
> +#include "musb_host.h"
>  #include "cppi_dma.h"
>  
> -
> -/* CPPI DMA status 7-mar-2006:
> +/* CPPI DMA status 7-mar:
>   *
>   * - See musb_{host,gadget}.c for more info
>   *
>   * - Correct RX DMA generally forces the engine into irq-per-packet mode,
>   *   which can easily saturate the CPU under non-mass-storage loads.
> - *
> - * NOTES 24-aug-2006 (2.6.18-rc4):
> - *
> - * - peripheral RXDMA wedged in a test with packets of length 512/512/1.
> - *   evidently after the 1 byte packet was received and acked, the queue
> - *   of BDs got garbaged so it wouldn't empty the fifo.  (rxcsr 0x2003,
> - *   and RX DMA0: 4 left, 80000000 8feff880, 8feff860 8feff860; 8f321401
> - *   004001ff 00000001 .. 8feff860)  Host was just getting NAKed on tx
> - *   of its next (512 byte) packet.  IRQ issues?
> - *
> - * REVISIT:  the "transfer DMA" glue between CPPI and USB fifos will
> - * evidently also directly update the RX and TX CSRs ... so audit all
> - * host and peripheral side DMA code to avoid CSR access after DMA has
> - * been started.
>   */
>  
>  /* REVISIT now we can avoid preallocating these descriptors; or
> @@ -76,29 +61,18 @@ cppi_bd_free(struct cppi_channel *c, struct cppi_descriptor *bd)
>   *  Initialize the DMA controller as necessary.
>   */
>  
> -/* zero out entire rx state RAM entry for the channel */
> -static void cppi_reset_rx(struct cppi_rx_stateram __iomem *rx)
> -{
> -	musb_writel(&rx->rx_skipbytes, 0, 0);
> -	musb_writel(&rx->rx_head, 0, 0);
> -	musb_writel(&rx->rx_sop, 0, 0);
> -	musb_writel(&rx->rx_current, 0, 0);
> -	musb_writel(&rx->rx_buf_current, 0, 0);
> -	musb_writel(&rx->rx_len_len, 0, 0);
> -	musb_writel(&rx->rx_cnt_cnt, 0, 0);
> -}
> +#define	CAST (void *__force __iomem)
>  
> -/* zero out entire tx state RAM entry for the channel */
> -static void cppi_reset_tx(struct cppi_tx_stateram __iomem *tx, u32 ptr)
> +/* zero out entire rx state RAM entry for the channel */
> +static void cppi_reset_rx(struct cppi_rx_stateram *__iomem rx)
>  {
> -	musb_writel(&tx->tx_head, 0, 0);
> -	musb_writel(&tx->tx_buf, 0, 0);
> -	musb_writel(&tx->tx_current, 0, 0);
> -	musb_writel(&tx->tx_buf_current, 0, 0);
> -	musb_writel(&tx->tx_info, 0, 0);
> -	musb_writel(&tx->tx_rem_len, 0, 0);
> -	/* musb_writel(&tx->tx_dummy, 0, 0); */
> -	musb_writel(&tx->tx_complete, 0, ptr);
> +	musb_writel(CAST & rx->rx_skipbytes, 0, 0);
> +	musb_writel(CAST & rx->rx_head, 0, 0);
> +	musb_writel(CAST & rx->rx_sop, 0, 0);
> +	musb_writel(CAST & rx->rx_current, 0, 0);
> +	musb_writel(CAST & rx->rx_buf_current, 0, 0);
> +	musb_writel(CAST & rx->rx_len_len, 0, 0);
> +	musb_writel(CAST & rx->rx_cnt_cnt, 0, 0);
>  }
>  
>  static void __init cppi_pool_init(struct cppi *cppi, struct cppi_channel *c)
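An aside for anyone chasing sparse warnings in this hunk: CAST is just an
address-space cast, so each of those musb_writel(CAST & ...) calls expands
to roughly the line below (assuming the usual musb_writel(addr, offset,
data) signature):

	/* what "musb_writel(CAST & rx->rx_head, 0, 0)" expands to */
	musb_writel((void __force __iomem *) &rx->rx_head, 0, 0);
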
> @@ -149,7 +123,7 @@ static void cppi_pool_free(struct cppi_channel *c)
>  static int __init cppi_controller_start(struct dma_controller *c)
>  {
>  	struct cppi	*controller;
> -	void __iomem	*tibase;
> +	void *__iomem	tibase;
>  	int		i;
>  
>  	controller = container_of(c, struct cppi, controller);
> @@ -170,29 +144,40 @@ static int __init cppi_controller_start(struct dma_controller *c)
>  	for (i = 0; i < ARRAY_SIZE(controller->rx); i++)
>  		cppi_pool_init(controller, controller->rx + i);
>  
> -	tibase =  controller->tibase;
> +	/* Do necessary configuration in H/w to get started */
> +	tibase = controller->mregs - DAVINCI_BASE_OFFSET;
> +
>  	INIT_LIST_HEAD(&controller->tx_complete);
>  
>  	/* initialise tx/rx channel head pointers to zero */
>  	for (i = 0; i < ARRAY_SIZE(controller->tx); i++) {
>  		struct cppi_channel	*tx_ch = controller->tx + i;
> -		struct cppi_tx_stateram __iomem *tx;
> +		struct cppi_tx_stateram *__iomem tx;
>  
>  		INIT_LIST_HEAD(&tx_ch->tx_complete);
>  
>  		tx = tibase + DAVINCI_TXCPPI_STATERAM_OFFSET(i);
>  		tx_ch->state_ram = tx;
> -		cppi_reset_tx(tx, 0);
> +		/* zero out entire state RAM entry for the channel */
> +		tx->tx_head = 0;
> +		tx->tx_buf = 0;
> +		tx->tx_current = 0;
> +		tx->tx_buf_current = 0;
> +		tx->tx_info = 0;
> +		tx->tx_rem_len = 0;
> +		/*txState->dummy = 0; */
> +		tx->tx_complete = 0;
> +
>  	}
>  	for (i = 0; i < ARRAY_SIZE(controller->rx); i++) {
>  		struct cppi_channel	*rx_ch = controller->rx + i;
> -		struct cppi_rx_stateram __iomem *rx;
> +		struct cppi_rx_stateram *__iomem rx;
>  
>  		INIT_LIST_HEAD(&rx_ch->tx_complete);
>  
>  		rx = tibase + DAVINCI_RXCPPI_STATERAM_OFFSET(i);
>  		rx_ch->state_ram = rx;
> -		cppi_reset_rx(rx);
> +		cppi_reset_rx(rx_ch->state_ram);
>  	}
>  
>  	/* enable individual cppi channels */
> @@ -205,11 +190,10 @@ static int __init cppi_controller_start(struct dma_controller *c)
>  	musb_writel(tibase, DAVINCI_TXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_ENABLE);
>  	musb_writel(tibase, DAVINCI_RXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_ENABLE);
>  
> -	/* disable RNDIS mode, also host rx RNDIS autorequest */
> -	musb_writel(tibase, DAVINCI_RNDIS_REG, 0);
> +	/* disable RNDIS mode */
>  	musb_writel(tibase, DAVINCI_AUTOREQ_REG, 0);
>  
> -	return 0;
> +	return true;
>  }
>  
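One thing worth double-checking in the hunk above: cppi_controller_start()
used to return 0 for success, and kernel convention is 0/-errno, so
"return true" could read as failure at the call site. A hypothetical
caller illustrating the concern (verify against the musb glue code):

	/* hypothetical caller -- nonzero means failure by convention */
	if (c->start && c->start(c) != 0)
		goto cleanup;	/* "return true" above would land here */
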
>  /*
> @@ -225,8 +209,7 @@ static int cppi_controller_stop(struct dma_controller *c)
>  	int			i;
>  
>  	controller = container_of(c, struct cppi, controller);
> -
> -	tibase = controller->tibase;
> +	tibase = controller->mregs - DAVINCI_BASE_OFFSET;
>  	/* DISABLE INDIVIDUAL CHANNEL Interrupts */
>  	musb_writel(tibase, DAVINCI_TXCPPI_INTCLR_REG,
>  			DAVINCI_DMA_ALL_CHANNELS_ENABLE);
> @@ -253,184 +236,143 @@ static int cppi_controller_stop(struct dma_controller *c)
>  	return 0;
>  }
>  
> -/* While dma channel is allocated, we only want the core irqs active
> - * for fault reports, otherwise we'd get irqs that we don't care about.
> - * Except for TX irqs, where dma done != fifo empty and reusable ...
> - *
> - * NOTE: docs don't say either way, but irq masking **enables** irqs.
> - *
> - * REVISIT same issue applies to pure PIO usage too, and non-cppi dma...
> - */
> -static inline void core_rxirq_disable(void __iomem *tibase, unsigned epnum)
> -{
> -	musb_writel(tibase, DAVINCI_USB_INT_MASK_CLR_REG, 1 << (epnum + 8));
> -}
> -
> -static inline void core_rxirq_enable(void __iomem *tibase, unsigned epnum)
> -{
> -	musb_writel(tibase, DAVINCI_USB_INT_MASK_SET_REG, 1 << (epnum + 8));
> -}
> -
> -
>  /*
>   * Allocate a CPPI Channel for DMA.  With CPPI, channels are bound to
>   * each transfer direction of a non-control endpoint, so allocating
>   * (and deallocating) is mostly a way to notice bad housekeeping on
>   * the software side.  We assume the irqs are always active.
>   */
> -static struct dma_channel *
> -cppi_channel_allocate(struct dma_controller *c,
> -		struct musb_hw_ep *ep, u8 transmit)
> +static struct dma_channel *cppi_channel_allocate(struct dma_controller *c,
> +					 struct musb_hw_ep *ep, u8 transmit)
>  {
> -	struct cppi		*controller;
> -	u8			index;
> -	struct cppi_channel	*cppi_ch;
> -	void __iomem		*tibase;
> +	struct cppi		 *controller;
> +	u8			 chnum;
> +	struct cppi_channel	 *cppi_ch;
>  
>  	controller = container_of(c, struct cppi, controller);
> -	tibase = controller->tibase;
> +	/* remember bLocalEnd: 1..Max_EndPt, and cppi ChNum:0..Max_EndPt-1 */
> +	chnum = ep->epnum  - 1;
>  
> -	/* ep0 doesn't use DMA; remember cppi indices are 0..N-1 */
> -	index = ep->epnum - 1;
> -
> -	/* return the corresponding CPPI Channel Handle, and
> -	 * probably disable the non-CPPI irq until we need it.
> -	 */
> +	/* as of now, just return the corresponding CPPI Channel Handle */
>  	if (transmit) {
> -		if (index >= ARRAY_SIZE(controller->tx)) {
> -			DBG(1, "no %cX%d CPPI channel\n", 'T', index);
> +		if (chnum >= ARRAY_SIZE(controller->tx)) {
> +			DBG(1, "no %cX DMA channel for ep%d\n", 'T', chnum);
>  			return NULL;
>  		}
> -		cppi_ch = controller->tx + index;
> +		cppi_ch = controller->tx + chnum;
>  	} else {
> -		if (index >= ARRAY_SIZE(controller->rx)) {
> -			DBG(1, "no %cX%d CPPI channel\n", 'R', index);
> +		if (chnum >= ARRAY_SIZE(controller->rx)) {
> +			DBG(1, "no %cX DMA channel for ep%d\n", 'R', chnum);
>  			return NULL;
>  		}
> -		cppi_ch = controller->rx + index;
> -		core_rxirq_disable(tibase, ep->epnum);
> +		cppi_ch = controller->rx + chnum;
>  	}
>  
>  	/* REVISIT make this an error later once the same driver code works
> -	 * with the other DMA engine too
> +	 * with the Mentor DMA engine too
>  	 */
>  	if (cppi_ch->hw_ep)
> -		DBG(1, "re-allocating DMA%d %cX channel %p\n",
> -				index, transmit ? 'T' : 'R', cppi_ch);
> +		DBG(6, "re-allocating DMA%d %cX channel %p\n",
> +		    chnum, transmit ? 'T' : 'R', cppi_ch);
>  	cppi_ch->hw_ep = ep;
>  	cppi_ch->channel.status = MUSB_DMA_STATUS_FREE;
>  
> -	DBG(4, "Allocate CPPI%d %cX\n", index, transmit ? 'T' : 'R');
> +	DBG(4, "Allocate CPPI%d %cX\n", chnum, transmit ? 'T' : 'R');
>  	return &cppi_ch->channel;
>  }
>  
>  /* Release a CPPI Channel.  */
>  static void cppi_channel_release(struct dma_channel *channel)
>  {
> -	struct cppi_channel	*c;
> -	void __iomem		*tibase;
> +	struct cppi_channel *c;
>  
>  	/* REVISIT:  for paranoia, check state and abort if needed... */
>  
>  	c = container_of(channel, struct cppi_channel, channel);
> -	tibase = c->controller->tibase;
>  	if (!c->hw_ep)
>  		DBG(1, "releasing idle DMA channel %p\n", c);
> -	else if (!c->transmit)
> -		core_rxirq_enable(tibase, c->index + 1);
>  
> -	/* for now, leave its cppi IRQ enabled (we won't trigger it) */
> +	/* but for now, not its IRQ */
>  	c->hw_ep = NULL;
>  	channel->status = MUSB_DMA_STATUS_UNKNOWN;
>  }
>  
>  /* Context: controller irqlocked */
> -static void
> -cppi_dump_rx(int level, struct cppi_channel *c, const char *tag)
> +static void cppi_dump_rx(int level, struct cppi_channel *c, const char *tag)
>  {
> -	void __iomem			*base = c->controller->mregs;
> -	struct cppi_rx_stateram __iomem	*rx = c->state_ram;
> +	void *__iomem base = c->controller->mregs;
>  
>  	musb_ep_select(base, c->index + 1);
>  
>  	DBG(level, "RX DMA%d%s: %d left, csr %04x, "
> -			"%08x H%08x S%08x C%08x, "
> -			"B%08x L%08x %08x .. %08x"
> -			"\n",
> -		c->index, tag,
> -		musb_readl(c->controller->tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + 4 *c->index),
> +		"%08x H%08x S%08x C%08x, " "B%08x L%08x %08x .. %08x"
> +		"\n",
> +		c->index, tag, musb_readl(base - DAVINCI_BASE_OFFSET,
> +		DAVINCI_RXCPPI_BUFCNT0_REG + 4 * c->index),
>  		musb_readw(c->hw_ep->regs, MUSB_RXCSR),
> -
> -		musb_readl(&rx->rx_skipbytes, 0),
> -		musb_readl(&rx->rx_head, 0),
> -		musb_readl(&rx->rx_sop, 0),
> -		musb_readl(&rx->rx_current, 0),
> -
> -		musb_readl(&rx->rx_buf_current, 0),
> -		musb_readl(&rx->rx_len_len, 0),
> -		musb_readl(&rx->rx_cnt_cnt, 0),
> -		musb_readl(&rx->rx_complete, 0)
> -		);
> +		musb_readl(c->state_ram, 0 * 4),	/* buf offset */
> +		musb_readl(c->state_ram, 1 * 4),	/* head ptr */
> +		musb_readl(c->state_ram, 2 * 4),	/* sop bd */
> +		musb_readl(c->state_ram, 3 * 4),	/* current bd */
> +		musb_readl(c->state_ram, 4 * 4),	/* current buf */
> +		musb_readl(c->state_ram, 5 * 4),	/* pkt len */
> +		musb_readl(c->state_ram, 6 * 4),	/* byte cnt */
> +		musb_readl(c->state_ram, 7 * 4)		/* completion */
> +	    );
>  }
>  
>  /* Context: controller irqlocked */
> -static void
> -cppi_dump_tx(int level, struct cppi_channel *c, const char *tag)
> +static void cppi_dump_tx(int level, struct cppi_channel *c, const char *tag)
>  {
> -	void __iomem			*base = c->controller->mregs;
> -	struct cppi_tx_stateram __iomem	*tx = c->state_ram;
> +	void *__iomem base = c->controller->mregs;
>  
>  	musb_ep_select(base, c->index + 1);
>  
>  	DBG(level, "TX DMA%d%s: csr %04x, "
> -			"H%08x S%08x C%08x %08x, "
> -			"F%08x L%08x .. %08x"
> -			"\n",
> -		c->index, tag,
> -		musb_readw(c->hw_ep->regs, MUSB_TXCSR),
> -
> -		musb_readl(&tx->tx_head, 0),
> -		musb_readl(&tx->tx_buf, 0),
> -		musb_readl(&tx->tx_current, 0),
> -		musb_readl(&tx->tx_buf_current, 0),
> -
> -		musb_readl(&tx->tx_info, 0),
> -		musb_readl(&tx->tx_rem_len, 0),
> +		"H%08x S%08x C%08x %08x, "
> +		"F%08x L%08x .. %08x" "\n",
> +		c->index, tag, musb_readw(c->hw_ep->regs, MUSB_TXCSR),
> +		musb_readl(c->state_ram, 0 * 4),	/* head ptr */
> +		musb_readl(c->state_ram, 1 * 4),	/* sop bd */
> +		musb_readl(c->state_ram, 2 * 4),	/* current bd */
> +		musb_readl(c->state_ram, 3 * 4),	/* buf offset */
> +		musb_readl(c->state_ram, 4 * 4),	/* flags */
> +		musb_readl(c->state_ram, 5 * 4),	/* len */
>  		/* dummy/unused word 6 */
> -		musb_readl(&tx->tx_complete, 0)
> -		);
> +		musb_readl(c->state_ram, 7 * 4)		/* completion */
> +	);
>  }
>  
>  /* Context: controller irqlocked */
>  static inline void
>  cppi_rndis_update(struct cppi_channel *c, int is_rx,
> -		void __iomem *tibase, int is_rndis)
> +		  void *__iomem tibase, int is_rndis)
>  {
>  	/* we may need to change the rndis flag for this cppi channel */
>  	if (c->is_rndis != is_rndis) {
> -		u32	value = musb_readl(tibase, DAVINCI_RNDIS_REG);
> -		u32	temp = 1 << (c->index);
> +		u32 regval = musb_readl(tibase, DAVINCI_RNDIS_REG);
> +		u32 temp = 1 << (c->index);
>  
>  		if (is_rx)
>  			temp <<= 16;
>  		if (is_rndis)
> -			value |= temp;
> +			regval |= temp;
>  		else
> -			value &= ~temp;
> -		musb_writel(tibase, DAVINCI_RNDIS_REG, value);
> +			regval &= ~temp;
> +		musb_writel(tibase, DAVINCI_RNDIS_REG, regval);
>  		c->is_rndis = is_rndis;
>  	}
>  }
>  
> +#if MUSB_DEBUG > 0
>  static void cppi_dump_rxbd(const char *tag, struct cppi_descriptor *bd)
>  {
>  	pr_debug("RXBD/%s %08x: "
> -			"nxt %08x buf %08x off.blen %08x opt.plen %08x\n",
> -			tag, bd->dma,
> -			bd->hw_next, bd->hw_bufp, bd->hw_off_len,
> -			bd->hw_options);
> +		 "nxt %08x buf %08x off.blen %08x opt.plen %08x\n",
> +		 tag, bd->dma,
> +		 bd->hw_next, bd->hw_bufp, bd->hw_off_len, bd->hw_options);
>  }
> +#endif
>  
>  static void cppi_dump_rxq(int level, const char *tag, struct cppi_channel *rx)
>  {
> @@ -447,17 +389,13 @@ static void cppi_dump_rxq(int level, const char *tag, struct cppi_channel *rx)
>  #endif
>  }
>  
> -
> -/* NOTE:  DaVinci autoreq is ignored except for host side "RNDIS" mode RX;
> - * so we won't ever use it (see "CPPI RX Woes" below).
> - */
>  static inline int cppi_autoreq_update(struct cppi_channel *rx,
> -		void __iomem *tibase, int onepacket, unsigned n_bds)
> +				      void *__iomem tibase, int shortpkt,
> +				      u8 rndis, signed n_bds, u8 startreq,
> +				      u8 endreq)
>  {
> -	u32	val;
> +	u32 tmp, val;
>  
> -#ifdef	RNDIS_RX_IS_USABLE
> -	u32	tmp;
>  	/* assert(is_host_active(musb)) */
>  
>  	/* start from "AutoReq never" */
> @@ -467,43 +405,28 @@ static inline int cppi_autoreq_update(struct cppi_channel *rx,
>  	/* HCD arranged reqpkt for packet #1.  we arrange int
>  	 * for all but the last one, maybe in two segments.
>  	 */
> -	if (!onepacket) {
> -#if 0
> -		/* use two segments, autoreq "all" then the last "never" */
> -		val |= ((0x3) << (rx->index * 2));
> -		n_bds--;
> -#else
> -		/* one segment, autoreq "all-but-last" */
> -		val |= ((0x1) << (rx->index * 2));
> -#endif
> +
> +	if (shortpkt && rndis) {
> +		val = (val | ((0x01) << (rx->index * 2)));
> +		rx->autoreq = 0x01;
> +	} else if (shortpkt && !rndis) {
> +		rx->autoreq = 0x00;
> +	} else if ((!shortpkt) && (n_bds > 2)) {
> +		/* a short packet might still arrive even though none was
> +		 * requested; we might convert into RNDIS mode
> +		 */
> +		val = (val | ((0x03) << (rx->index * 2)));
> +		rx->autoreq = 0x03;
> +		if (endreq)
> +			n_bds -= 2;
> +	} else {
> +		rx->autoreq = 0;
>  	}
>  
>  	if (val != tmp) {
> -		int n = 100;
> -
> -		/* make sure that autoreq is updated before continuing */
>  		musb_writel(tibase, DAVINCI_AUTOREQ_REG, val);
> -		do {
> -			tmp = musb_readl(tibase, DAVINCI_AUTOREQ_REG);
> -			if (tmp == val)
> -				break;
> -			cpu_relax();
> -		} while (n-- > 0);
>  	}
> -#endif
> -
> -	/* REQPKT is turned off after each segment */
> -	if (n_bds && rx->channel.actual_len) {
> -		void __iomem	*regs = rx->hw_ep->regs;
>  
> -		val = musb_readw(regs, MUSB_RXCSR);
> -		if (!(val & MUSB_RXCSR_H_REQPKT)) {
> -			val |= MUSB_RXCSR_H_REQPKT | MUSB_RXCSR_H_WZC_BITS;
> -			musb_writew(regs, MUSB_RXCSR, val);
> -			/* flush writebufer */
> -			val = musb_readw(regs, MUSB_RXCSR);
> -		}
> -	}
>  	return n_bds;
>  }
>  
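For readers decoding the autoreq arithmetic above: each RX channel owns
two bits in DAVINCI_AUTOREQ_REG. A sketch of the encoding as the old and
new code together imply it (worth re-checking against the DaVinci USB
documentation):

	/* two bits per RX channel in DAVINCI_AUTOREQ_REG (sketch) */
	u32 shift        = rx->index * 2;
	u32 never        = 0x0 << shift;	/* no autorequest       */
	u32 all_but_last = 0x1 << shift;	/* all but last packet  */
	u32 always       = 0x3 << shift;	/* every packet         */
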
> @@ -536,88 +459,83 @@ static inline int cppi_autoreq_update(struct cppi_channel *rx,
>   * ========
>   * TX is a lot more reasonable than RX; it doesn't need to run in
>   * irq-per-packet mode very often.  RNDIS mode seems to behave too
> - * (except how it handles the exactly-N-packets case).  Building a
> + * (other than how it handles the exactly-N-packets case).  Building a
>   * txdma queue with multiple requests (urb or usb_request) looks
> - * like it would work ... but fault handling would need much testing.
> - *
> - * The main issue with TX mode RNDIS relates to transfer lengths that
> - * are an exact multiple of the packet length.  It appears that there's
> - * a hiccup in that case (maybe the DMA completes before the ZLP gets
> - * written?) boiling down to not being able to rely on CPPI writing any
> - * terminating zero length packet before the next transfer is written.
> - * So that's punted to PIO; better yet, gadget drivers can avoid it.
> - *
> - * Plus, there's allegedly an undocumented constraint that rndis transfer
> - * length be a multiple of 64 bytes ... but the chip doesn't act that
> - * way, and we really don't _want_ that behavior anyway.
> - *
> - * On TX, "transparent" mode works ... although experiments have shown
> - * problems trying to use the SOP/EOP bits in different USB packets.
> - *
> - * REVISIT try to handle terminating zero length packets using CPPI
> - * instead of doing it by PIO after an IRQ.  (Meanwhile, make Ethernet
> - * links avoid that issue by forcing them to avoid zlps.)
> + * like it would work ... but fault handling still needs testing.
>   */
>  static void
> -cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx)
> +cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx, int rndis)
>  {
> -	unsigned		maxpacket = tx->maxpacket;
> -	dma_addr_t		addr = tx->buf_dma + tx->offset;
> -	size_t			length = tx->buf_len - tx->offset;
> -	struct cppi_descriptor	*bd;
> -	unsigned		n_bds;
> -	unsigned		i;
> -	struct cppi_tx_stateram	__iomem *tx_ram = tx->state_ram;
> -	int			rndis;
> +	unsigned maxpacket = tx->maxpacket;
> +	size_t length = tx->buf_len - tx->offset;
> +	struct cppi_descriptor *bd;
> +	unsigned n_bds;
> +	unsigned i, iso = (tx->rxmode == 2)? 1 : 0;
> +	int	iso_desc = tx->iso_desc;
> +	struct cppi_tx_stateram *txstate = tx->state_ram;
>  
>  	/* TX can use the CPPI "rndis" mode, where we can probably fit this
> -	 * transfer in one BD and one IRQ.  The only time we would NOT want
> -	 * to use it is when hardware constraints prevent it, or if we'd
> -	 * trigger the "send a ZLP?" confusion.
> +	 * transfer in one BD and one IRQ; though some common cases (like
> +	 * packet length not being n*64 bytes) can't work that way.
> +	 *
> +	 * To cppi hardware (but not the RNDIS protocol!) RNDIS is mostly a
> +	 * "short packet termination" mode.  So the only time we would NOT
> +	 * want to use it is to avoid sending spurious zero length packets,
> +	 * or when hardware constraints prevent it.
>  	 */
> -	rndis = (maxpacket & 0x3f) == 0
> -		&& length < 0xffff
> -		&& (length % maxpacket) != 0;
>  
> -	if (rndis) {
> +#if 0				/* Disable RNDIS on both Tx & Rx */
> +	if (!rndis && (length % maxpacket) != 0)
> +		rndis = 1;
> +	if (rndis && (length > 0xffff || (maxpacket & 0x3f) != 0
> +		      /* "undocumented" rndis mode constraint on txlen */
> +		      || (length & 0x3f) != 0))
> +#endif
> +		rndis = 0;
> +
> +	if (iso) {
> +		n_bds = (tx->hw_ep->num_iso_desc - iso_desc)
> +				>= NUM_TXCHAN_BD ?  NUM_TXCHAN_BD :
> +				tx->hw_ep->num_iso_desc - iso_desc;
> +		tx->offset = tx->hw_ep->iso_desc[iso_desc].offset;
> +	} else if (!length) {
> +		rndis = 0;
> +		n_bds = 1;
> +	} else if (rndis) {
>  		maxpacket = length;
>  		n_bds = 1;
>  	} else {
>  		n_bds = length / maxpacket;
> -		if (!length || (length % maxpacket))
> +		if ((length % maxpacket) || (tx->rxmode == 1))
>  			n_bds++;
> -		n_bds = min(n_bds, (unsigned) NUM_TXCHAN_BD);
> +
> +		n_bds = min(n_bds, (unsigned)NUM_TXCHAN_BD);
>  		length = min(n_bds * maxpacket, length);
>  	}
>  
> -	DBG(4, "TX DMA%d, pktSz %d %s bds %d dma 0x%x len %u\n",
> -			tx->index,
> -			maxpacket,
> -			rndis ? "rndis" : "transparent",
> -			n_bds,
> -			addr, length);
> +	DBG(4, "TX DMA%d, pktsz %d %s bds %d dma 0x%x len %u\n",
> +	    tx->index,
> +	    maxpacket,
> +	    rndis ? "rndis" : "transparent",
> +	    n_bds, tx->buf_dma + tx->offset, length);
>  
>  	cppi_rndis_update(tx, 0, musb->ctrl_base, rndis);
>  
> -	/* assuming here that channel_program is called during
> +	/* assuming here that DmaProgramChannel is called during
>  	 * transfer initiation ... current code maintains state
>  	 * for one outstanding request only (no queues, not even
>  	 * the implicit ones of an iso urb).
>  	 */
>  
>  	bd = tx->freelist;
> -	tx->head = bd;
> +	tx->head = tx->freelist;
>  	tx->last_processed = NULL;
>  
> -	/* FIXME use BD pool like RX side does, and just queue
> -	 * the minimum number for this request.
> -	 */
> -
>  	/* Prepare queue of BDs first, then hand it to hardware.
>  	 * All BDs except maybe the last should be of full packet
>  	 * size; for RNDIS there _is_ only that last packet.
>  	 */
> -	for (i = 0; i < n_bds; ) {
> +	for (i = 0; i < n_bds;) {
>  		if (++i < n_bds && bd->next)
>  			bd->hw_next = bd->next->dma;
>  		else
> @@ -625,31 +543,42 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx)
>  
>  		bd->hw_bufp = tx->buf_dma + tx->offset;
>  
> -		/* FIXME set EOP only on the last packet,
> -		 * SOP only on the first ... avoid IRQs
> -		 */
> -		if ((tx->offset + maxpacket) <= tx->buf_len) {
> +		if (iso) {
> +			bd->hw_off_len = tx->hw_ep->iso_desc[iso_desc].length;
> +			bd->hw_options = CPPI_SOP_SET|CPPI_EOP_SET|CPPI_OWN_SET;
> +			if (tx->hw_ep->iso_desc[iso_desc].length == 0)
> +				bd->hw_options |= CPPI_ZERO_SET|1;
> +			else
> +				bd->hw_options |= tx->hw_ep->
> +						iso_desc[iso_desc].length;
> +
> +			tx->offset =  tx->hw_ep->
> +						iso_desc[++iso_desc].offset;
> +		} else if ((tx->offset + maxpacket)
> +		    <= tx->buf_len) {
>  			tx->offset += maxpacket;
>  			bd->hw_off_len = maxpacket;
>  			bd->hw_options = CPPI_SOP_SET | CPPI_EOP_SET
> -				| CPPI_OWN_SET | maxpacket;
> +			    | CPPI_OWN_SET | maxpacket;
>  		} else {
>  			/* only this one may be a partial USB Packet */
> -			u32		partial_len;
> +			u32 buffsz;
>  
> -			partial_len = tx->buf_len - tx->offset;
> +			buffsz = tx->buf_len - tx->offset;
>  			tx->offset = tx->buf_len;
> -			bd->hw_off_len = partial_len;
> +			bd->hw_off_len = (buffsz) ? buffsz : 1;
>  
>  			bd->hw_options = CPPI_SOP_SET | CPPI_EOP_SET
> -				| CPPI_OWN_SET | partial_len;
> -			if (partial_len == 0)
> +			    | CPPI_OWN_SET | (buffsz ? buffsz : 1);
> +			if (buffsz == 0) {
> +				tx->hw_ep->zero = 1;
>  				bd->hw_options |= CPPI_ZERO_SET;
> +			}
>  		}
>  
>  		DBG(5, "TXBD %p: nxt %08x buf %08x len %04x opt %08x\n",
> -				bd, bd->hw_next, bd->hw_bufp,
> -				bd->hw_off_len, bd->hw_options);
> +			bd, bd->hw_next, bd->hw_bufp, bd->hw_off_len,
> +			bd->hw_options);
>  
>  		/* update the last BD enqueued to the list */
>  		tx->tail = bd;
> @@ -657,17 +586,17 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx)
>  	}
>  
>  	/* BDs live in DMA-coherent memory, but writes might be pending */
> -	cpu_drain_writebuffer();
> +	/*cpu_drain_writebuffer();*/
>  
> -	/* Write to the HeadPtr in state RAM to trigger */
> -	musb_writel(&tx_ram->tx_head, 0, (u32)tx->freelist->dma);
> +	/* Write to the HeadPtr in StateRam to trigger */
> +	txstate->tx_head = (u32) tx->freelist->dma;
>  
>  	cppi_dump_tx(5, tx, "/S");
>  }
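To make the descriptor-count logic above easier to follow: for the
transparent (non-RNDIS, non-iso) path it reduces to roughly the sketch
below, where want_zlp stands for the new tx->rxmode == 1 case and
NUM_TXCHAN_BD is the pool size from cppi_dma.h:

	/* sketch of the transparent-mode TX BD count used above */
	static unsigned tx_bd_count(unsigned length, unsigned maxpacket,
				    int want_zlp)
	{
		unsigned n_bds = length / maxpacket;

		if ((length % maxpacket) || want_zlp)
			n_bds++;	/* partial tail, or trailing ZLP */
		return min(n_bds, (unsigned) NUM_TXCHAN_BD);
	}

For a 3000-byte transfer with 512-byte packets that gives six BDs: five
full packets plus a 440-byte tail.
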
>  
>  /*
> - * CPPI RX Woes:
> - * =============
> + * CPPI RX:
> + * ========
>   * Consider a 1KB bulk RX buffer in two scenarios:  (a) it's fed two 300 byte
>   * packets back-to-back, and (b) it's fed two 512 byte packets back-to-back.
>   * (Full speed transfers have similar scenarios.)
> @@ -676,16 +605,14 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx)
>   * and the next packet goes into a buffer that's queued later; while (b) fills
>   * the buffer with 1024 bytes.  How to do that with CPPI?
>   *
> - * - RX queues in "rndis" mode -- one single BD -- handle (a) correctly, but
> - *   (b) loses **BADLY** because nothing (!) happens when that second packet
> + * - CPPI RX queues in "rndis" mode -- one single BD -- handle (a) correctly,
> + *   but (b) loses _badly_ because nothing (!) happens when that second packet
>   *   fills the buffer, much less when a third one arrives.  (Which makes this
>   *   not a "true" RNDIS mode.  In the RNDIS protocol short-packet termination
> - *   is optional, and it's fine if peripherals -- not hosts! -- pad messages
> - *   out to end-of-buffer.  Standard PCI host controller DMA descriptors
> - *   implement that mode by default ... which is no accident.)
> + *   is optional, and it's fine if senders pad messages out to end-of-buffer.)
>   *
> - * - RX queues in "transparent" mode -- two BDs with 512 bytes each -- have
> - *   converse problems:  (b) is handled right, but (a) loses badly.  CPPI RX
> + * - CPPI RX queues in "transparent" mode -- two BDs with 512 bytes each -- have
> + *   converse problems:  (b) is handled correctly, but (a) loses badly.  CPPI RX
>   *   ignores SOP/EOP markings and processes both of those BDs; so both packets
>   *   are loaded into the buffer (with a 212 byte gap between them), and the next
>   *   buffer queued will NOT get its 300 bytes of data. (It seems like SOP/EOP
> @@ -703,126 +630,91 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx)
>   * of that garbaged datastream.
>   *
>   * But there seems to be no way to identify the cases where CPPI RNDIS mode
> - * is appropriate -- which do NOT include RNDIS host drivers, but do include
> + * is appropriate -- which do NOT include the RNDIS driver, but do include
>   * the CDC Ethernet driver! -- and the documentation is incomplete/wrong.
> - * So we can't _ever_ use RX RNDIS mode ... except by using a heuristic
> - * that applies best on the peripheral side (and which could fail rudely).
> + * So we can't _ever_ use RX RNDIS mode.
>   *
>   * Leaving only "transparent" mode; we avoid multi-bd modes in almost all
> - * cases other than mass storage class.  Otherwise we're correct but slow,
> + * cases other than mass storage class.  Otherwise we're correct but slow,
>   * since CPPI penalizes our need for a "true RNDIS" default mode.
>   */
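Concretely, with a 1KB buffer and two 512-byte transparent-mode BDs
queued, the two scenarios above play out like this (worked example, not
code from the patch):

	/* (a) 300 + 300 bytes: packet 1 closes BD 0 at offset 300, but
	 *     CPPI still consumes BD 1, so packet 2 lands at offset 512,
	 *     leaving a 212-byte gap -- a corrupted datastream.
	 * (b) 512 + 512 bytes: both BDs fill cleanly; 1024 bytes, no gap.
	 */
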
>  
> -
> -/* Heuristic, intended to kick in for ethernet/rndis peripheral ONLY
> - *
> - * IFF
> - *  (a)	peripheral mode ... since rndis peripherals could pad their
> - *	writes to hosts, causing i/o failure; or we'd have to cope with
> - *	a largely unknowable variety of host side protocol variants
> - *  (b)	and short reads are NOT errors ... since full reads would
> - *	cause those same i/o failures
> - *  (c)	and read length is
> - *	- less than 64KB (max per cppi descriptor)
> - *	- not a multiple of 4096 (g_zero default, full reads typical)
> - *	- N (>1) packets long, ditto (full reads not EXPECTED)
> - * THEN
> - *   try rx rndis mode
> - *
> - * Cost of heuristic failing:  RXDMA wedges at the end of transfers that
> - * fill out the whole buffer.  Buggy host side usb network drivers could
> - * trigger that, but "in the field" such bugs seem to be all but unknown.
> - *
> - * So this module parameter lets the heuristic be disabled.  When using
> - * gadgetfs, the heuristic will probably need to be disabled.
> - */
> -static int cppi_rx_rndis = 1;
> -
> -module_param(cppi_rx_rndis, bool, 0);
> -MODULE_PARM_DESC(cppi_rx_rndis, "enable/disable RX RNDIS heuristic");
> -
> -
>  /**
>   * cppi_next_rx_segment - dma read for the next chunk of a buffer
>   * @musb: the controller
>   * @rx: dma channel
>   * @onepacket: true unless caller treats short reads as errors, and
> - *	performs fault recovery above usbcore.
> + *	 performs fault recovery above usbcore.
>   * Context: controller irqlocked
>   *
>   * See above notes about why we can't use multi-BD RX queues except in
>   * rare cases (mass storage class), and can never use the hardware "rndis"
> - * mode (since it's not a "true" RNDIS mode) with complete safety..
> + * mode (since it's not a "true" RNDIS mode).
>   *
>   * It's ESSENTIAL that callers specify "onepacket" mode unless they kick in
>   * code to recover from corrupted datastreams after each short transfer.
>   */
>  static void
> -cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket)
> +cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int shortpkt)
>  {
> -	unsigned		maxpacket = rx->maxpacket;
> -	dma_addr_t		addr = rx->buf_dma + rx->offset;
> -	size_t			length = rx->buf_len - rx->offset;
> -	struct cppi_descriptor	*bd, *tail;
> -	unsigned		n_bds;
> -	unsigned		i;
> -	void __iomem		*tibase = musb->ctrl_base;
> -	int			is_rndis = 0;
> -	struct cppi_rx_stateram	__iomem *rx_ram = rx->state_ram;
> -
> -	if (onepacket) {
> -		/* almost every USB driver, host or peripheral side */
> -		n_bds = 1;
> -
> -		/* maybe apply the heuristic above */
> -		if (cppi_rx_rndis
> -				&& is_peripheral_active(musb)
> -				&& length > maxpacket
> -				&& (length & ~0xffff) == 0
> -				&& (length & 0x0fff) != 0
> -				&& (length & (maxpacket - 1)) == 0) {
> -			maxpacket = length;
> -			is_rndis = 1;
> -		}
> -	} else {
> -		/* virtually nothing except mass storage class */
> -		if (length > 0xffff) {
> -			n_bds = 0xffff / maxpacket;
> -			length = n_bds * maxpacket;
> +	unsigned maxpacket = rx->maxpacket;
> +	dma_addr_t addr = rx->buf_dma + rx->offset;
> +	size_t length = rx->buf_len - rx->offset;
> +	struct cppi_descriptor *bd, *tail;
> +	signed n_bds;
> +	signed i;
> +	void *__iomem tibase = musb->ctrl_base;
> +	u8 rndis = 0;
> +	int max_bd = 0;
> +	unsigned iso = (rx->rxmode == 2)? 1 : 0;
> +	int     iso_desc = rx->iso_desc;
> +
> +	if (((rx->rxmode == 1) && ((maxpacket & 0x3f) == 0)
> +		/*REVISIT MAXPACKET CHECK!!!!*/
> +	     && ((length & 0x3f) == 0))) {
> +		rndis = 1;
> +		max_bd = 65536;	/* MAX buffer size per RNDIS BD is 64K */
>  		} else {
> -			n_bds = length / maxpacket;
> -			if (length % maxpacket)
> -				n_bds++;
> -		}
> -		if (n_bds == 1)
> -			onepacket = 1;
> -		else
> -			n_bds = min(n_bds, (unsigned) NUM_RXCHAN_BD);
> +		rndis = 0;
> +		max_bd = maxpacket;
>  	}
> +#ifdef CONFIG_ARCH_DAVINCI
> +	/* Do not use RNDIS dma for DaVinci */
> +	rndis = 0;
> +	max_bd = maxpacket;
> +#endif
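Note the effect of that #ifdef: on DaVinci builds the rndis/max_bd
computation just above it is effectively dead, and every non-iso RX BD
is capped at maxpacket. In other words (sketch):

	/* what RX sizing reduces to when CONFIG_ARCH_DAVINCI is set */
	n_bds = DIV_ROUND_UP(length, maxpacket);
	n_bds = min(n_bds, (signed) NUM_RXCHAN_BD);
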
> +	if (iso)
> +		max_bd = rx->hw_ep->iso_desc[iso_desc].length;
> +
> +	n_bds = length / max_bd;
> +	if (length % max_bd)
> +		n_bds++;
> +
> +	n_bds = min(n_bds, (signed)NUM_RXCHAN_BD);
> +	if (n_bds == NUM_RXCHAN_BD)
> +		length = min(length, (size_t) (n_bds * max_bd));
> +
> +	cppi_rndis_update(rx, 1, musb->ctrl_base, rndis);
>  
>  	/* In host mode, autorequest logic can generate some IN tokens; it's
>  	 * tricky since we can't leave REQPKT set in RXCSR after the transfer
>  	 * finishes. So:  multipacket transfers involve two or more segments.
> -	 * And always at least two IRQs ... RNDIS mode is not an option.
> +	 * And always at least two IRQs.
>  	 */
>  	if (is_host_active(musb))
> -		n_bds = cppi_autoreq_update(rx, tibase, onepacket, n_bds);
> -
> -	cppi_rndis_update(rx, 1, musb->ctrl_base, is_rndis);
> -
> -	length = min(n_bds * maxpacket, length);
> -
> -	DBG(4, "RX DMA%d seg, maxp %d %s bds %d (cnt %d) "
> -			"dma 0x%x len %u %u/%u\n",
> -			rx->index, maxpacket,
> -			onepacket
> -				? (is_rndis ? "rndis" : "onepacket")
> -				: "multipacket",
> -			n_bds,
> -			musb_readl(tibase,
> -				DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4))
> -					& 0xffff,
> -			addr, length, rx->channel.actual_len, rx->buf_len);
> +		n_bds = cppi_autoreq_update(rx, tibase, shortpkt, rndis, n_bds,
> +					    (u8) !rx->offset,
> +					    (u8) (rx->buf_len <=
> +						  (rx->offset + length)
> +					    ));
> +
> +	DBG(4, "RX DMA%d seg, maxp %d %spacket bds %d (cnt %d) "
> +	    "dma 0x%x len %u/%u/%u\n",
> +	    rx->index, max_bd,
> +	    shortpkt ? "one" : "multi",
> +	    n_bds,
> +	    musb_readl(tibase, DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4))
> +	    & 0xffff, addr, length, rx->actuallen, rx->buf_len);
>  
>  	/* only queue one segment at a time, since the hardware prevents
>  	 * correct queue shutdown after unexpected short packets
> @@ -832,7 +724,7 @@ cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket)
>  
>  	/* Build BDs for all packets in this segment */
>  	for (i = 0, tail = NULL; bd && i < n_bds; i++, tail = bd) {
> -		u32	bd_len;
> +		u32 buffsz;
>  
>  		if (i) {
>  			bd = cppi_bd_alloc(rx);
> @@ -844,20 +736,20 @@ cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket)
>  		bd->hw_next = 0;
>  
>  		/* all but the last packet will be maxpacket size */
> -		if (maxpacket < length)
> -			bd_len = maxpacket;
> +		if (max_bd < length)
> +			buffsz = max_bd;
>  		else
> -			bd_len = length;
> +			buffsz = length;
>  
>  		bd->hw_bufp = addr;
> -		addr += bd_len;
> -		rx->offset += bd_len;
> +		addr += buffsz;
> +		rx->offset += buffsz;
>  
> -		bd->hw_off_len = (0 /*offset*/ << 16) + bd_len;
> -		bd->buflen = bd_len;
> +		bd->hw_off_len = (0 /*offset */  << 16) + buffsz;
> +		bd->buflen = buffsz;
>  
>  		bd->hw_options = CPPI_OWN_SET | (i == 0 ? length : 0);
> -		length -= bd_len;
> +		length -= buffsz;
>  	}
>  
>  	/* we always expect at least one reusable BD! */
> @@ -872,65 +764,39 @@ cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket)
>  
>  	bd = rx->head;
>  	rx->tail = tail;
> -
> +#if 0
>  	/* short reads and other faults should terminate this entire
>  	 * dma segment.  we want one "dma packet" per dma segment, not
>  	 * one per USB packet, terminating the whole queue at once...
>  	 * NOTE that current hardware seems to ignore SOP and EOP.
>  	 */
> -	bd->hw_options |= CPPI_SOP_SET;
> -	tail->hw_options |= CPPI_EOP_SET;
> -
> -	if (debug >= 5) {
> -		struct cppi_descriptor	*d;
> +	if (MGC_DebugLevel >= 5) {
> +		struct cppi_descriptor *d;
>  
>  		for (d = rx->head; d; d = d->next)
>  			cppi_dump_rxbd("S", d);
>  	}
> -
> -	/* in case the preceding transfer left some state... */
> -	tail = rx->last_processed;
> -	if (tail) {
> -		tail->next = bd;
> -		tail->hw_next = bd->dma;
> -	}
> -
> -	core_rxirq_enable(tibase, rx->index + 1);
> -
> +#endif
> +	rx->last_processed = NULL;
>  	/* BDs live in DMA-coherent memory, but writes might be pending */
> -	cpu_drain_writebuffer();
> +	/*cpu_drain_writebuffer();*/
>  
>  	/* REVISIT specs say to write this AFTER the BUFCNT register
>  	 * below ... but that loses badly.
>  	 */
> -	musb_writel(&rx_ram->rx_head, 0, bd->dma);
> +	musb_writel(rx->state_ram, 4, bd->dma);
>  
>  	/* bufferCount must be at least 3, and zeroes on completion
>  	 * unless it underflows below zero, or stops at two, or keeps
>  	 * growing ... grr.
>  	 */
> -	i = musb_readl(tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4))
> -			& 0xffff;
> +	i = musb_readl(tibase, DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4))
> +	    & 0xffff;
>  
> -	if (!i)
> -		musb_writel(tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4),
> -			n_bds + 2);
> -	else if (n_bds > (i - 3))
> +	if (n_bds > (i - 2)) {
>  		musb_writel(tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4),
> -			n_bds - (i - 3));
> -
> -	i = musb_readl(tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4))
> -			& 0xffff;
> -	if (i < (2 + n_bds)) {
> -		DBG(2, "bufcnt%d underrun - %d (for %d)\n",
> -					rx->index, i, n_bds);
> -		musb_writel(tibase,
> -			DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4),
> -			n_bds + 2);
> +			    DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4),
> +			    n_bds - i + 2);
>  	}
>  
>  	cppi_dump_rx(4, rx, "/S");
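A quick arithmetic check on that BUFCNT top-up (assuming writes add to
the hardware count, as the old three-branch code implied): with current
count i == 0 and n_bds == 4 it writes 4 - 0 + 2 = 6, the old "n_bds + 2"
initialization; with i == 3 it writes 3, topping the count back up to
n_bds + 2; and once i >= n_bds + 2 the condition is false and nothing is
written.
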
> @@ -938,46 +804,41 @@ cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket)
>  
>  /**
>   * cppi_channel_program - program channel for data transfer
> - * @ch: the channel
> - * @maxpacket: max packet size
> + * @pChannel: the channel
> + * @wPacketsz: max packet size
>   * @mode: For RX, 1 unless the usb protocol driver promised to treat
> - *	all short reads as errors and kick in high level fault recovery.
> - *	For TX, ignored because of RNDIS mode races/glitches.
> + *	 all short reads as errors and kick in high level fault recovery.
> + *	 For TX, 0 unless the protocol driver _requires_ short-packet
> + *	 termination mode.
>   * @dma_addr: dma address of buffer
> - * @len: length of buffer
> + * @dwLength: length of buffer
>   * Context: controller irqlocked
>   */
> -static int cppi_channel_program(struct dma_channel *ch,
> -		u16 maxpacket, u8 mode,
> -		dma_addr_t dma_addr, u32 len)
> +static int  cppi_channel_program(struct dma_channel *ch,
> +			       u16 maxpacket, u8 mode,
> +			       dma_addr_t dma_addr, u32 len)
>  {
> -	struct cppi_channel	*cppi_ch;
> -	struct cppi		*controller;
> -	struct musb		*musb;
> -
> -	cppi_ch = container_of(ch, struct cppi_channel, channel);
> -	controller = cppi_ch->controller;
> -	musb = controller->musb;
> +	struct cppi_channel *cppi_ch = container_of(ch, struct cppi_channel,
> +						channel);
> +	struct cppi *controller = cppi_ch->controller;
> +	struct musb *musb = controller->musb;
>  
>  	switch (ch->status) {
>  	case MUSB_DMA_STATUS_BUS_ABORT:
>  	case MUSB_DMA_STATUS_CORE_ABORT:
>  		/* fault irq handler should have handled cleanup */
>  		WARN("%cX DMA%d not cleaned up after abort!\n",
> -				cppi_ch->transmit ? 'T' : 'R',
> -				cppi_ch->index);
> -		/* WARN_ON(1); */
> +		     cppi_ch->transmit ? 'T' : 'R', cppi_ch->index);
> +		/*WARN_ON(1);*/
>  		break;
>  	case MUSB_DMA_STATUS_BUSY:
>  		WARN("program active channel?  %cX DMA%d\n",
> -				cppi_ch->transmit ? 'T' : 'R',
> -				cppi_ch->index);
> -		/* WARN_ON(1); */
> +		     cppi_ch->transmit ? 'T' : 'R', cppi_ch->index);
> +		/*WARN_ON(1);*/
>  		break;
>  	case MUSB_DMA_STATUS_UNKNOWN:
>  		DBG(1, "%cX DMA%d not allocated!\n",
> -				cppi_ch->transmit ? 'T' : 'R',
> -				cppi_ch->index);
> +		    cppi_ch->transmit ? 'T' : 'R', cppi_ch->index);
>  		/* FALLTHROUGH */
>  	case MUSB_DMA_STATUS_FREE:
>  		break;
> @@ -989,146 +850,118 @@ static int cppi_channel_program(struct dma_channel *ch,
>  	cppi_ch->buf_dma = dma_addr;
>  	cppi_ch->offset = 0;
>  	cppi_ch->maxpacket = maxpacket;
> +	cppi_ch->actuallen = 0;
>  	cppi_ch->buf_len = len;
> +	cppi_ch->rxmode = mode;
> +	cppi_ch->reqcomplete = 0;
> +	cppi_ch->autoreq = 0;
> +	cppi_ch->iso_desc = 0;
>  
>  	/* TX channel? or RX? */
>  	if (cppi_ch->transmit)
> -		cppi_next_tx_segment(musb, cppi_ch);
> +		cppi_next_tx_segment(musb, cppi_ch, mode);
>  	else
> -		cppi_next_rx_segment(musb, cppi_ch, mode);
> +		cppi_next_rx_segment(musb, cppi_ch , mode);
>  
>  	return true;
>  }
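As a usage note for the reworked "mode" parameter documented above, a
host-side submission would look something like this (hypothetical call
site; channel_program and channel_release are the musb dma_controller
ops):

	/* mode 0: don't force short-packet termination on TX */
	if (!controller->channel_program(channel, maxpacket, 0,
					 urb->transfer_dma, length)) {
		controller->channel_release(channel);
		/* ... fall back to PIO ... */
	}
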
>  
> -static bool cppi_rx_scan(struct cppi *cppi, unsigned ch)
> +static int cppi_rx_scan(struct cppi *cppi, unsigned ch, u8 abort)
>  {
> -	struct cppi_channel		*rx = &cppi->rx[ch];
> -	struct cppi_rx_stateram __iomem	*state = rx->state_ram;
> -	struct cppi_descriptor		*bd;
> -	struct cppi_descriptor		*last = rx->last_processed;
> -	bool				completed = false;
> -	bool				acked = false;
> -	int				i;
> -	dma_addr_t			safe2ack;
> -	void __iomem			*regs = rx->hw_ep->regs;
> +	struct cppi_channel *rx = &cppi->rx[ch];
> +	struct cppi_rx_stateram *state = rx->state_ram;
> +	struct cppi_descriptor *bd;
> +	struct cppi_descriptor *last = rx->last_processed;
> +	int completed = 0;
> +	dma_addr_t safe2ack;
> +	u32 csr;
>  
>  	cppi_dump_rx(6, rx, "/K");
>  
> -	bd = last ? last->next : rx->head;
> -	if (!bd)
> -		return false;
> -
> -	/* run through all completed BDs */
> -	for (i = 0, safe2ack = musb_readl(&state->rx_complete, 0);
> -			(safe2ack || completed) && bd && i < NUM_RXCHAN_BD;
> -			i++, bd = bd->next) {
> -		u16	len;
> +	if (abort) {
> +		safe2ack = musb_readl(CAST & state->rx_complete, 0);
>  
> -		rmb();
> -		if (!completed && (bd->hw_options & CPPI_OWN_SET))
> -			break;
> -
> -		DBG(5, "C/RXBD %08x: nxt %08x buf %08x "
> -			"off.len %08x opt.len %08x (%d)\n",
> -			bd->dma, bd->hw_next, bd->hw_bufp,
> -			bd->hw_off_len, bd->hw_options,
> -			rx->channel.actual_len);
> -
> -		/* actual packet received length */
> -		if ((bd->hw_options & CPPI_SOP_SET) && !completed)
> -			len = bd->hw_off_len & CPPI_RECV_PKTLEN_MASK;
> -		else
> -			len = 0;
> -
> -		if (bd->hw_options & CPPI_EOQ_MASK)
> -			completed = true;
> -
> -		if (!completed && len < bd->buflen) {
> -			/* NOTE:  when we get a short packet, RXCSR_H_REQPKT
> -			 * must have been cleared, and no more DMA packets may
> -			 * active be in the queue... TI docs didn't say, but
> -			 * CPPI ignores those BDs even though OWN is still set.
> -			 */
> -			completed = true;
> -			DBG(3, "rx short %d/%d (%d)\n",
> -					len, bd->buflen,
> -					rx->channel.actual_len);
> +		if ((last == NULL) || (safe2ack == last->dma)) {
> +			if (last == NULL)
> +				last = rx->head;
> +			goto free;
>  		}
> +	}
>  
> -		/* If we got here, we expect to ack at least one BD; meanwhile
> -		 * CPPI may completing other BDs while we scan this list...
> -		 *
> -		 * RACE: we can notice OWN cleared before CPPI raises the
> -		 * matching irq by writing that BD as the completion pointer.
> -		 * In such cases, stop scanning and wait for the irq, avoiding
> -		 * lost acks and states where BD ownership is unclear.
> -		 */
> -		if (bd->dma == safe2ack) {
> -			musb_writel(&state->rx_complete, 0, safe2ack);
> -			safe2ack = musb_readl(&state->rx_complete, 0);
> -			acked = true;
> -			if (bd->dma == safe2ack)
> -				safe2ack = 0;
> -		}
> +	bd = last ? last->next : rx->head;
>  
> -		rx->channel.actual_len += len;
> +	do {
> +		safe2ack = musb_readl(CAST & state->rx_complete, 0);
> +		if (completed)
> +			break;
> +		do {
> +			u16 len;
> +			/*rmb();*/
> +			DBG(5, "C/RXBD %08x: nxt %08x buf %08x "
> +			    "off.len %08x opt.len %08x (%d)\n",
> +			    bd->dma, bd->hw_next, bd->hw_bufp,
> +			    bd->hw_off_len, bd->hw_options, rx->actuallen);
> +
> +			/* actual packet received length */
> +			len = bd->hw_off_len & CPPI_RECV_PKTLEN_MASK;
> +			if (bd->hw_options & CPPI_ZERO_SET)
> +				len = 0;
>  
> -		cppi_bd_free(rx, last);
> -		last = bd;
> +			if (bd->hw_next == 0)
> +				completed = 1;
>  
> -		/* stop scanning on end-of-segment */
> -		if (bd->hw_next == 0)
> -			completed = true;
> -	}
> -	rx->last_processed = last;
> +			if ((len < bd->buflen) && (rx->rxmode != 2)) {
> +				rx->reqcomplete = 1;
> +				completed = 1;
> +				DBG(3, "rx short %d/%d (%d)\n",
> +				    len, bd->buflen, rx->actuallen);
> +			}
>  
> -	/* dma abort, lost ack, or ... */
> -	if (!acked && last) {
> -		int	csr;
> +			if (rx->rxmode == 2) {
> +				rx->hw_ep->iso_desc[rx->iso_desc].length = len;
> +				if (completed) {
> +					rx->reqcomplete = 1;
> +					rx->iso_desc = 0;
> +				} else
> +					rx->iso_desc++;
> +			}
>  
> -		if (safe2ack == 0 || safe2ack == rx->last_processed->dma)
> -			musb_writel(&state->rx_complete, 0, safe2ack);
> -		if (safe2ack == 0) {
> +			rx->actuallen += len;
>  			cppi_bd_free(rx, last);
> -			rx->last_processed = NULL;
> -
> -			/* if we land here on the host side, H_REQPKT will
> -			 * be clear and we need to restart the queue...
> -			 */
> -			WARN_ON(rx->head);
> -		}
> -		musb_ep_select(cppi->mregs, rx->index + 1);
> -		csr = musb_readw(regs, MUSB_RXCSR);
> -		if (csr & MUSB_RXCSR_DMAENAB) {
> -			DBG(4, "list%d %p/%p, last %08x%s, csr %04x\n",
> -				rx->index,
> -				rx->head, rx->tail,
> -				rx->last_processed
> -					? rx->last_processed->dma
> -					: 0,
> -				completed ? ", completed" : "",
> -				csr);
> -			cppi_dump_rxq(4, "/what?", rx);
> -		}
> +			last = bd;
> +			if (safe2ack == bd->dma) {
> +				bd = bd->next;
> +				break;
> +			}
> +			bd = bd->next;
> +		} while (!completed);
> +	} while (musb_readl(CAST & state->rx_complete, 0) != safe2ack);
> +
> +	if (is_host_active(rx->controller->musb) && (!abort) &&
> +		(rx->autoreq == 0) &&
> +		((!completed) || (completed && (rx->reqcomplete == 0) &&
> +		(rx->actuallen != rx->buf_len)))) {
> +		csr = musb_readw(rx->hw_ep->regs, MUSB_RXCSR);
> +		csr |= MUSB_RXCSR_H_REQPKT;
> +		musb_writew(rx->hw_ep->regs, MUSB_RXCSR, csr);
>  	}
> -	if (!completed) {
> -		int	csr;
> -
> -		rx->head = bd;
>  
> -		/* REVISIT seems like "autoreq all but EOP" doesn't...
> -		 * setting it here "should" be racey, but seems to work
> -		 */
> -		csr = musb_readw(rx->hw_ep->regs, MUSB_RXCSR);
> -		if (is_host_active(cppi->musb)
> -				&& bd
> -				&& !(csr & MUSB_RXCSR_H_REQPKT)) {
> -			csr |= MUSB_RXCSR_H_REQPKT;
> -			musb_writew(regs, MUSB_RXCSR,
> -					MUSB_RXCSR_H_WZC_BITS | csr);
> -			csr = musb_readw(rx->hw_ep->regs, MUSB_RXCSR);
> +	rx->last_processed = last;
> +	musb_writel(CAST & state->rx_complete, 0, safe2ack);
> +free:
> +	if (completed || abort) {
> +
> +		/* Flush BD's not consumed */
> +		while (last != NULL) {
> +			bd = last;
> +			last = last->next;
> +			cppi_bd_free(rx, bd);
>  		}
> -	} else {
> +		if (abort) {
> +			safe2ack = musb_readl(CAST & state->rx_complete, 0);
> +			musb_writel(CAST & state->rx_complete, 0, safe2ack);
> +		}
> +		rx->last_processed = NULL;
>  		rx->head = NULL;
>  		rx->tail = NULL;
>  	}
> @@ -1139,146 +972,158 @@ static bool cppi_rx_scan(struct cppi *cppi, unsigned ch)
>  
>  void cppi_completion(struct musb *musb, u32 rx, u32 tx)
>  {
> -	void __iomem		*tibase;
> -	int			i, index;
> -	struct cppi		*cppi;
> -	struct musb_hw_ep	*hw_ep = NULL;
> +	void *__iomem tibase;
> +	int i, channum;
> +	u8 reqcomplete;
> +	struct cppi *cppi;
> +	struct cppi_descriptor *bdptr;
>  
>  	cppi = container_of(musb->dma_controller, struct cppi, controller);
> -
> -	tibase = musb->ctrl_base;
> +	tibase = cppi->mregs - DAVINCI_BASE_OFFSET;
>  
>  	/* process TX channels */
> -	for (index = 0; tx; tx = tx >> 1, index++) {
> -		struct cppi_channel		*tx_ch;
> -		struct cppi_tx_stateram __iomem	*tx_ram;
> -		bool				completed = false;
> -		struct cppi_descriptor		*bd;
> -
> -		if (!(tx & 1))
> -			continue;
> -
> -		tx_ch = cppi->tx + index;
> -		tx_ram = tx_ch->state_ram;
> -
> -		/* FIXME  need a cppi_tx_scan() routine, which
> -		 * can also be called from abort code
> -		 */
> -
> -		cppi_dump_tx(5, tx_ch, "/E");
> -
> -		bd = tx_ch->head;
> -
> -		if (NULL == bd) {
> -			DBG(1, "null BD\n");
> -			continue;
> -		}
> -
> -		/* run through all completed BDs */
> -		for (i = 0; !completed && bd && i < NUM_TXCHAN_BD;
> -				i++, bd = bd->next) {
> -			u16	len;
> -
> -			rmb();
> -			if (bd->hw_options & CPPI_OWN_SET)
> -				break;
> -
> -			DBG(5, "C/TXBD %p n %x b %x off %x opt %x\n",
> -					bd, bd->hw_next, bd->hw_bufp,
> -					bd->hw_off_len, bd->hw_options);
> +	for (channum = 0; tx; tx = tx >> 1, channum++) {
> +		if (tx & 1) {
> +			struct cppi_channel *txchannel;
> +			struct cppi_tx_stateram *txstate;
> +			u32 safe2ack = 0;
>  
> -			len = bd->hw_off_len & CPPI_BUFFER_LEN_MASK;
> -			tx_ch->channel.actual_len += len;
> +			txchannel = cppi->tx + channum;
> +			txstate = txchannel->state_ram;
>  
> -			tx_ch->last_processed = bd;
> -
> -			/* write completion register to acknowledge
> -			 * processing of completed BDs, and possibly
> -			 * release the IRQ; EOQ might not be set ...
> -			 *
> -			 * REVISIT use the same ack strategy as rx
> -			 *
> -			 * REVISIT have observed bit 18 set; huh??
> -			 */
> -			/* if ((bd->hw_options & CPPI_EOQ_MASK)) */
> -				musb_writel(&tx_ram->tx_complete, 0, bd->dma);
> -
> -			/* stop scanning on end-of-segment */
> -			if (bd->hw_next == 0)
> -				completed = true;
> -		}
> -
> -		/* on end of segment, maybe go to next one */
> -		if (completed) {
> -			/* cppi_dump_tx(4, tx_ch, "/complete"); */
> -
> -			/* transfer more, or report completion */
> -			if (tx_ch->offset >= tx_ch->buf_len) {
> -				tx_ch->head = NULL;
> -				tx_ch->tail = NULL;
> -				tx_ch->channel.status = MUSB_DMA_STATUS_FREE;
> -
> -				hw_ep = tx_ch->hw_ep;
> -
> -				/* Peripheral role never repurposes the
> -				 * endpoint, so immediate completion is
> -				 * safe.  Host role waits for the fifo
> -				 * to empty (TXPKTRDY irq) before going
> -				 * to the next queued bulk transfer.
> -				 */
> -				if (is_host_active(cppi->musb)) {
> -#if 0
> -					/* WORKAROUND because we may
> -					 * not always get TXKPTRDY ...
> +			/* FIXME  need a cppi_tx_scan() routine, which
> +			 * can also be called from abort code
>  					 */
> -					int	csr;
> -
> -					csr = musb_readw(hw_ep->regs,
> -						MUSB_TXCSR);
> -					if (csr & MUSB_TXCSR_TXPKTRDY)
> -#endif
> -						completed = false;
> -				}
> -				if (completed)
> -					musb_dma_completion(musb, index + 1, 1);
> -
> -			} else {
> -				/* Bigger transfer than we could fit in
> -				 * that first batch of descriptors...
> +			cppi_dump_tx(5, txchannel, "/E");
> +
> +			bdptr = txchannel->head;
> +			i = 0;
> +			reqcomplete = 0;
> +
> +			do {
> +				safe2ack = txstate->tx_complete;
> +				do {
> +					/*rmb();*/
> +					DBG(5, "C/TXBD %p n %x b %x off %x "
> +						"opt %x\n",
> +						bdptr, bdptr->hw_next,
> +						bdptr->hw_bufp,
> +						bdptr->hw_off_len,
> +						bdptr->hw_options);
> +
> +					txchannel->actuallen += (u16) (bdptr->
> +							hw_off_len &
> +							CPPI_BUFFER_LEN_MASK);
> +					if (txchannel->rxmode == 2) {
> +						txchannel->hw_ep->
> +						iso_desc[txchannel->iso_desc].
> +						status = 0;
> +						txchannel->iso_desc++;
> +					}
> +					if (bdptr->dma == safe2ack) {
> +						if (bdptr->hw_options &
> +							CPPI_ZERO_SET)
> +							txchannel->actuallen -=
> +							1;
> +						if (bdptr->hw_next == 0)
> +							reqcomplete = 1;
> +						txchannel->last_processed =
> +						    bdptr;
> +						bdptr = bdptr->next;
> +						break;
> +					}
> +					bdptr = bdptr->next;
> +				} while (1);
> +			} while (txstate->tx_complete != safe2ack);
> +
> +			txstate->tx_complete = txchannel->last_processed->
> +						dma;
> +
> +			/* on end of segment, maybe go to next one */
> +			if (reqcomplete) {
> +				cppi_dump_tx(4, txchannel, "/complete");
> +
> +				/* transfer more, or report completion */
> +				if ((txchannel->actuallen
> +				    >= txchannel->buf_len)) {
> +					if ((txchannel->rxmode == 1) &&
> +						(!(txchannel->actuallen %
> +						txchannel->maxpacket)) &&
> +						(!txchannel->hw_ep->zero)) {
> +						cppi_next_tx_segment(musb,
> +							txchannel,
> +							txchannel->rxmode);
> +					} else {
> +						txchannel->hw_ep->zero = 0;
> +						txchannel->head =
> +							NULL;
> +						txchannel->tail =
> +							NULL;
> +						txchannel->channel.status =
> +						    MUSB_DMA_STATUS_FREE;
> +						txchannel->channel.
> +							actual_len =
> +						    txchannel->actuallen;
> +						musb_dma_completion(musb,
> +							channum + 1, 1);
> +					}
> +
> +				} else {
> +					/* Bigger transfer than we could fit in
> +					 * that first batch of descriptors...
>  				 */
> -				cppi_next_tx_segment(musb, tx_ch);
> +					cppi_next_tx_segment(musb,
> +							txchannel,
> +							txchannel->rxmode);
>  			}
> -		} else
> -			tx_ch->head = bd;
> +			} else
> +				txchannel->head = bdptr;
> +		}
>  	}
>  
>  	/* Start processing the RX block */
> -	for (index = 0; rx; rx = rx >> 1, index++) {
> +	for (channum = 0; rx; rx = rx >> 1, channum++) {
>  
>  		if (rx & 1) {
> -			struct cppi_channel		*rx_ch;
> -
> -			rx_ch = cppi->rx + index;
> +			struct cppi_channel *rxchannel;
> +
> +			rxchannel = cppi->rx + channum;
> +			/* There is a race condition between channel abort
> +			 * and ongoing traffic.  Ensure that pending
> +			 * interrupts (raised during cleanup but held off by
> +			 * the interrupt block) are acknowledged correctly,
> +			 * by treating them as aborts from interrupt context,
> +			 * since we already handled the user-initiated abort
> +			 * that led to this race.  This is needed because we
> +			 * do not support clean RX teardown at this point
> +			 * (this code may become unnecessary once RX teardown
> +			 * is correctly supported in hardware).
> +			 */
> +			if (rxchannel->head)
> +				reqcomplete = cppi_rx_scan(cppi, channum, 0);
> +			else {
> +				cppi_rx_scan(cppi, channum, 1);
> +				continue;
> +			}
>  
>  			/* let incomplete dma segments finish */
> -			if (!cppi_rx_scan(cppi, index))
> +			if (!reqcomplete)
>  				continue;
>  
>  			/* start another dma segment if needed */
> -			if (rx_ch->channel.actual_len != rx_ch->buf_len
> -					&& rx_ch->channel.actual_len
> -						== rx_ch->offset) {
> -				cppi_next_rx_segment(musb, rx_ch, 1);
> +			if ((rxchannel->actuallen != rxchannel->buf_len)
> +			    && !(rxchannel->reqcomplete)) {
> +				cppi_next_rx_segment(musb, rxchannel,
> +						     rxchannel->rxmode);
>  				continue;
>  			}
>  
>  			/* all segments completed! */
> -			rx_ch->channel.status = MUSB_DMA_STATUS_FREE;
> -
> -			hw_ep = rx_ch->hw_ep;
> -
> -			core_rxirq_disable(tibase, index + 1);
> -			musb_dma_completion(musb, index + 1, 0);
> +			rxchannel->channel.status = MUSB_DMA_STATUS_FREE;
> +			rxchannel->channel.actual_len =
> +			    rxchannel->actuallen;
> +			musb_dma_completion(musb, channum + 1, 0);
>  		}
>  	}
>  
> @@ -1287,8 +1132,8 @@ void cppi_completion(struct musb *musb, u32 rx, u32 tx)
>  }
>  
>  /* Instantiate a software object representing a DMA controller. */
> -struct dma_controller *__init
> -dma_controller_create(struct musb *musb, void __iomem *mregs)
> +struct dma_controller *__init dma_controller_create(struct musb *musb,
> +						void __iomem *mregs)
>  {
>  	struct cppi		*controller;
>  
> @@ -1313,9 +1158,9 @@ dma_controller_create(struct musb *musb, void __iomem *mregs)
>  
>  	/* setup BufferPool */
>  	controller->pool = dma_pool_create("cppi",
> -			controller->musb->controller,
> -			sizeof(struct cppi_descriptor),
> -			CPPI_DESCRIPTOR_ALIGN, 0);
> +					    controller->musb->controller,
> +					    sizeof(struct cppi_descriptor),
> +					    CPPI_DESCRIPTOR_ALIGN, 0);
>  	if (!controller->pool) {
>  		kfree(controller);
>  		return NULL;
> @@ -1329,14 +1174,13 @@ dma_controller_create(struct musb *musb, void __iomem *mregs)
>   */
>  void dma_controller_destroy(struct dma_controller *c)
>  {
> -	struct cppi	*cppi;
> -
> -	cppi = container_of(c, struct cppi, controller);
> +	struct cppi *cpcontroller = container_of(c, struct cppi, controller);
>  
>  	/* assert:  caller stopped the controller first */
> -	dma_pool_destroy(cppi->pool);
> +	dma_pool_destroy(cpcontroller->pool);
> +
> +	kfree(cpcontroller);
>  
> -	kfree(cppi);
>  }
>  
>  /*
> @@ -1344,17 +1188,13 @@ void dma_controller_destroy(struct dma_controller *c)
>   */
>  static int cppi_channel_abort(struct dma_channel *channel)
>  {
> -	struct cppi_channel	*cppi_ch;
> -	struct cppi		*controller;
> -	void __iomem		*mbase;
> -	void __iomem		*tibase;
> -	void __iomem		*regs;
> -	u32			value;
> -	struct cppi_descriptor	*queue;
> -
> -	cppi_ch = container_of(channel, struct cppi_channel, channel);
> -
> -	controller = cppi_ch->controller;
> +	struct cppi_channel *cppi_ch = container_of(channel,
> +						struct cppi_channel, channel);
> +	struct cppi *controller = cppi_ch->controller;
> +	int chnum = cppi_ch->index;
> +	void __iomem *mbase;
> +	void __iomem *tibase;
> +	u32 regval;
>  
>  	switch (channel->status) {
>  	case MUSB_DMA_STATUS_BUS_ABORT:
> @@ -1362,41 +1202,40 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  		/* from RX or TX fault irq handler */
>  	case MUSB_DMA_STATUS_BUSY:
>  		/* the hardware needs shutting down */
> -		regs = cppi_ch->hw_ep->regs;
>  		break;
>  	case MUSB_DMA_STATUS_UNKNOWN:
> +		DBG(8, "%cX DMA%d not allocated\n",
> +		    cppi_ch->transmit ? 'T' : 'R', cppi_ch->index);
> +		/* FALLTHROUGH */
>  	case MUSB_DMA_STATUS_FREE:
>  		return 0;
> -	default:
> -		return -EINVAL;
>  	}
>  
> +	if (chnum & ~CPPI_CHNUM_BITS_MASK)
> +		return -EINVAL;
> +
>  	if (!cppi_ch->transmit && cppi_ch->head)
> -		cppi_dump_rxq(3, "/abort", cppi_ch);
> +		cppi_dump_rxq(4, "/abort", cppi_ch);
>  
>  	mbase = controller->mregs;
> -	tibase = controller->tibase;
> -
> -	queue = cppi_ch->head;
> -	cppi_ch->head = NULL;
> -	cppi_ch->tail = NULL;
> +	tibase = mbase - DAVINCI_BASE_OFFSET;
>  
>  	/* REVISIT should rely on caller having done this,
>  	 * and caller should rely on us not changing it.
>  	 * peripheral code is safe ... check host too.
>  	 */
> -	musb_ep_select(mbase, cppi_ch->index + 1);
> +	musb_ep_select(mbase, chnum + 1);
>  
>  	if (cppi_ch->transmit) {
> -		struct cppi_tx_stateram __iomem *tx_ram;
> -		int			enabled;
> +		struct cppi_tx_stateram __iomem *txstate;
> +		int enabled;
>  
>  		/* mask interrupts raised to signal teardown complete.  */
>  		enabled = musb_readl(tibase, DAVINCI_TXCPPI_INTENAB_REG)
> -				& (1 << cppi_ch->index);
> +		    & (1 << cppi_ch->index);
>  		if (enabled)
>  			musb_writel(tibase, DAVINCI_TXCPPI_INTCLR_REG,
> -					(1 << cppi_ch->index));
> +				    (1 << cppi_ch->index));
>  
>  		/* REVISIT put timeouts on these controller handshakes */
>  
> @@ -1404,43 +1243,55 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  
>  		/* teardown DMA engine then usb core */
>  		do {
> -			value = musb_readl(tibase, DAVINCI_TXCPPI_TEAR_REG);
> -		} while (!(value & CPPI_TEAR_READY));
> -		musb_writel(tibase, DAVINCI_TXCPPI_TEAR_REG, cppi_ch->index);
> +			regval = musb_readl(tibase, DAVINCI_TXCPPI_TEAR_REG);
> +		} while (!(regval & CPPI_TEAR_READY));
> +		musb_writel(tibase, DAVINCI_TXCPPI_TEAR_REG, chnum);
>  
> -		tx_ram = cppi_ch->state_ram;
> +		txstate = cppi_ch->state_ram;
>  		do {
> -			value = musb_readl(&tx_ram->tx_complete, 0);
> -		} while (0xFFFFFFFC != value);
> -		musb_writel(&tx_ram->tx_complete, 0, 0xFFFFFFFC);
> +			regval = txstate->tx_complete;
> +		} while (0xFFFFFFFC != regval);
> +		txstate->tx_complete = 0xFFFFFFFC;
> +
> +		musb_writel(tibase, DAVINCI_CPPI_EOI_REG, 0);
>  
>  		/* FIXME clean up the transfer state ... here?
>  		 * the completion routine should get called with
>  		 * an appropriate status code.
>  		 */
>  
> -		value = musb_readw(regs, MUSB_TXCSR);
> -		value &= ~MUSB_TXCSR_DMAENAB;
> -		value |= MUSB_TXCSR_FLUSHFIFO;
> -		musb_writew(regs, MUSB_TXCSR, value);
> -		musb_writew(regs, MUSB_TXCSR, value);
> +		regval = musb_readw(cppi_ch->hw_ep->regs, MUSB_TXCSR);
> +		regval |= MUSB_TXCSR_FLUSHFIFO;
> +		musb_writew(cppi_ch->hw_ep->regs, MUSB_TXCSR, regval);
> +		musb_writew(cppi_ch->hw_ep->regs, MUSB_TXCSR, regval);
>  
> -		/* re-enable interrupt */
> -		if (enabled)
> -			musb_writel(tibase, DAVINCI_TXCPPI_INTENAB_REG,
> -					(1 << cppi_ch->index));
> +		txstate->tx_head = 0;
> +		txstate->tx_buf = 0;
> +		txstate->tx_buf_current = 0;
> +		txstate->tx_current = 0;
> +		txstate->tx_info = 0;
> +		txstate->tx_rem_len = 0;
>  
> -		/* While we scrub the TX state RAM, ensure that we clean
> -		 * up any interrupt that's currently asserted:
> +		/* Ensure that we clean up any interrupt currently asserted:
>  		 * 1. Write to completion Ptr value 0x1(bit 0 set)
>  		 *    (write back mode)
>  		 * 2. Write to completion Ptr value 0x0(bit 0 cleared)
>  		 *    (compare mode)
> -		 * Value written is compared(for bits 31:2) and when
> -		 * equal, interrupt is deasserted.
> +		 * The value written is compared (for bits 31:2) and,
> +		 * when equal, the interrupt is deasserted.
> +		 */
> +
> +		/* write back mode, bit 0 set, hence completion Ptr
> +		 * must be updated
>  		 */
> -		cppi_reset_tx(tx_ram, 1);
> -		musb_writel(&tx_ram->tx_complete, 0, 0);
> +		txstate->tx_complete = 0x1;
> +		/* compare mode, write back zero now */
> +		txstate->tx_complete = 0;
> +
> +		/* re-enable interrupt */
> +		if (enabled)
> +			musb_writel(tibase, DAVINCI_TXCPPI_INTENAB_REG,
> +				    (1 << cppi_ch->index));
>  
>  		cppi_dump_tx(5, cppi_ch, " (done teardown)");
>  
> @@ -1448,8 +1299,9 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  		 * as the RX side ... this does no cleanup at all!
>  		 */
>  
> -	} else /* RX */ {
> -		u16			csr;
> +	} else {		/* RX */
> +
> +		u16 csr;
>  
>  		/* NOTE: docs don't guarantee any of this works ...  we
>  		 * expect that if the usb core stops telling the cppi core
> @@ -1457,33 +1309,30 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  		 * current RX DMA state iff any pending fifo transfer is done.
>  		 */
>  
> -		core_rxirq_disable(tibase, cppi_ch->index + 1);
> -
>  		/* for host, ensure ReqPkt is never set again */
>  		if (is_host_active(cppi_ch->controller->musb)) {
> -			value = musb_readl(tibase, DAVINCI_AUTOREQ_REG);
> -			value &= ~((0x3) << (cppi_ch->index * 2));
> -			musb_writel(tibase, DAVINCI_AUTOREQ_REG, value);
> +			regval = musb_readl(tibase, DAVINCI_AUTOREQ_REG);
> +			regval &= ~((0x3) << (cppi_ch->index * 2));
> +			musb_writel(tibase, DAVINCI_AUTOREQ_REG, regval);
>  		}
>  
> -		csr = musb_readw(regs, MUSB_RXCSR);
> +		csr = musb_readw(cppi_ch->hw_ep->regs, MUSB_RXCSR);
>  
>  		/* for host, clear (just) ReqPkt at end of current packet(s) */
>  		if (is_host_active(cppi_ch->controller->musb)) {
>  			csr |= MUSB_RXCSR_H_WZC_BITS;
>  			csr &= ~MUSB_RXCSR_H_REQPKT;
>  		} else
> -			csr |= MUSB_RXCSR_P_WZC_BITS;
> +			csr |= MUSB_RXCSR_H_WZC_BITS;
>  
>  		/* clear dma enable */
>  		csr &= ~(MUSB_RXCSR_DMAENAB);
> -		musb_writew(regs, MUSB_RXCSR, csr);
> -		csr = musb_readw(regs, MUSB_RXCSR);
> +		musb_writew(cppi_ch->hw_ep->regs, MUSB_RXCSR, csr);
>  
> -		/* Quiesce: wait for current dma to finish (if not cleanup).
> -		 * We can't use bit zero of stateram->rx_sop, since that
> +		/* Quiesce: wait for current dma to finish (if not cleanup);
> +		 * we can't use bit zero of stateram->sopDescPtr since that
>  		 * refers to an entire "DMA packet" not just emptying the
> -		 * current fifo.  Most segments need multiple usb packets.
> +		 * current fifo, and most segments need multiple fifos.
>  		 */
>  		if (channel->status == MUSB_DMA_STATUS_BUSY)
>  			udelay(50);
> @@ -1491,7 +1340,8 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  		/* scan the current list, reporting any data that was
>  		 * transferred and acking any IRQ
>  		 */
> -		cppi_rx_scan(controller, cppi_ch->index);
> +		cppi_rx_scan(controller, chnum, 1);
> +		channel->actual_len += cppi_ch->actuallen;
>  
>  		/* clobber the existing state once it's idle
>  		 *
> @@ -1510,17 +1360,6 @@ static int cppi_channel_abort(struct dma_channel *channel)
>  
>  		/* ... we don't "free" that list, only mutate it in place.  */
>  		cppi_dump_rx(5, cppi_ch, " (done abort)");
> -
> -		/* clean up previously pending bds */
> -		cppi_bd_free(cppi_ch, cppi_ch->last_processed);
> -		cppi_ch->last_processed = NULL;
> -
> -		while (queue) {
> -			struct cppi_descriptor	*tmp = queue->next;
> -
> -			cppi_bd_free(cppi_ch, queue);
> -			queue = tmp;
> -		}
>  	}
>  
>  	channel->status = MUSB_DMA_STATUS_FREE;
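
One detail in the teardown path above is worth spelling out: the pair of
writes to tx_complete works because bit 0 of the completion pointer
selects the write mode.  A minimal sketch, assuming the cppi_tx_stateram
layout from cppi_dma.h below (the helper name is mine, not part of the
patch):

	/* Sketch only, not from the patch: acknowledge a TX completion
	 * interrupt left asserted after teardown.  Bit 0 of tx_complete
	 * selects the mode: set = write-back (the pointer is updated),
	 * clear = compare (bits 31:2 are compared and the IRQ deasserts
	 * on a match).
	 */
	static inline void cppi_tx_ack_irq(struct cppi_tx_stateram __iomem *tx)
	{
		tx->tx_complete = 0x1;	/* write-back: update pointer */
		tx->tx_complete = 0x0;	/* compare: match, deassert */
	}
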
> diff --git a/drivers/usb/musb/cppi_dma.h b/drivers/usb/musb/cppi_dma.h
> index fc5216b..0d15901 100644
> --- a/drivers/usb/musb/cppi_dma.h
> +++ b/drivers/usb/musb/cppi_dma.h
> @@ -11,120 +11,150 @@
>  
>  #include "musb_dma.h"
>  #include "musb_core.h"
> +#include "davinci.h"
>  
> +/* hw_options bit masks for CPPI BDs */
> +#define CPPI_SOP_SET	((u32)(1 << 31))
> +#define CPPI_EOP_SET	((u32)(1 << 30))
> +#define CPPI_OWN_SET	((u32)(1 << 29))	/* owned by cppi */
> +#define CPPI_EOQ_MASK	((u32)(1 << 28))
> +#define CPPI_ZERO_SET	((u32)(1 << 23))	/* rx saw zlp; tx issues one */
> +#define CPPI_RXABT_MASK	((u32)(1 << 19))	/* need more rx buffers */
>  
> -/* FIXME fully isolate CPPI from DaVinci ... the "CPPI generic" registers
> - * would seem to be shared with the TUSB6020 (over VLYNQ).
> - */
> -
> -#include "davinci.h"
> +#define CPPI_RECV_PKTLEN_MASK 0xFFFF
> +#define CPPI_BUFFER_LEN_MASK 0xFFFF
>  
> +#define CPPI_TEAR_READY ((u32)(1 << 31))
> +#define CPPI_CHNUM_BITS_MASK  0x3
>  
>  /* CPPI RX/TX state RAM */
>  
>  struct cppi_tx_stateram {
> -	u32 tx_head;			/* "DMA packet" head descriptor */
> +	u32 tx_head;			/* "DMA packet" head descriptor */
>  	u32 tx_buf;
> -	u32 tx_current;			/* current descriptor */
> +	u32 tx_current;			/* current descriptor */
>  	u32 tx_buf_current;
> -	u32 tx_info;			/* flags, remaining buflen */
> +	u32 tx_info;			/* flags, remaining buflen */
>  	u32 tx_rem_len;
> -	u32 tx_dummy;			/* unused */
> +	u32 tx_dummy;			/* unused */
>  	u32 tx_complete;
>  };
>  
>  struct cppi_rx_stateram {
>  	u32 rx_skipbytes;
>  	u32 rx_head;
> -	u32 rx_sop;			/* "DMA packet" head descriptor */
> -	u32 rx_current;			/* current descriptor */
> +	u32 rx_sop;			/* "DMA packet" head descriptor */
> +	u32 rx_current;			/* current descriptor */
>  	u32 rx_buf_current;
>  	u32 rx_len_len;
>  	u32 rx_cnt_cnt;
>  	u32 rx_complete;
>  };
>  
> -/* hw_options bits in CPPI buffer descriptors */
> -#define CPPI_SOP_SET	((u32)(1 << 31))
> -#define CPPI_EOP_SET	((u32)(1 << 30))
> -#define CPPI_OWN_SET	((u32)(1 << 29))	/* owned by cppi */
> -#define CPPI_EOQ_MASK	((u32)(1 << 28))
> -#define CPPI_ZERO_SET	((u32)(1 << 23))	/* rx saw zlp; tx issues one */
> -#define CPPI_RXABT_MASK	((u32)(1 << 19))	/* need more rx buffers */
> -
> -#define CPPI_RECV_PKTLEN_MASK 0xFFFF
> -#define CPPI_BUFFER_LEN_MASK 0xFFFF
> -
> -#define CPPI_TEAR_READY ((u32)(1 << 31))
> -
>  /* CPPI data structure definitions */
>  
> -#define	CPPI_DESCRIPTOR_ALIGN	16	/* bytes; 5-dec docs say 4-byte align */
> +/**
> + *  CPPI Buffer Descriptor
> + *
> + *  Buffer descriptor structure for the USB OTG module's CPPI.  The same
> + *  structure is used for both Tx and Rx.
> + */
> +
> +#define	CPPI_DESCRIPTOR_ALIGN	16	/* bytes; 5-dec docs say 4-byte align */
>  
>  struct cppi_descriptor {
> -	/* hardware overlay */
> -	u32		hw_next;	/* next buffer descriptor Pointer */
> -	u32		hw_bufp;	/* i/o buffer pointer */
> -	u32		hw_off_len;	/* buffer_offset16, buffer_length16 */
> -	u32		hw_options;	/* flags:  SOP, EOP etc*/
> -
> -	struct cppi_descriptor *next;
> -	dma_addr_t	dma;		/* address of this descriptor */
> -	u32		buflen;		/* for RX: original buffer length */
> +	/* Hardware Overlay */
> +	u32 hw_next;	/**< next (hardware) buffer descriptor pointer */
> +	u32 hw_bufp;	/**< buffer pointer (dma_addr_t) */
> +	u32 hw_off_len;	/**< buffer_offset16, buffer_length16 */
> +	u32 hw_options;	/**< option fields for SOP, EOP etc. */
> +
> +	struct cppi_descriptor *next;	/**< next (software) buffer
> +					 * descriptor pointer
> +					 */
> +	dma_addr_t dma;		/* address of this descriptor */
> +	/* for Rx Desc, keep track of enqueued Buffer len to detect
> +	 * short packets
> +	 */
> +	u32 buflen;
>  } __attribute__ ((aligned(CPPI_DESCRIPTOR_ALIGN)));
>  
> -
> +/* forward declaration for CppiDmaController structure */
>  struct cppi;
>  
> -/* CPPI  Channel Control structure */
> +/**
> + *  Channel Control Structure
> + *
> + * CPPI channel control structure.  The same structure is used for Tx
> + * and Rx; if need be, derive from it later.
> + */
>  struct cppi_channel {
> -	struct dma_channel	channel;
> +	/* First field must be dma_channel for easy type casting
> +	 * FIXME just use container_of() and be typesafe instead!
> +	 */
> +	struct dma_channel channel;
>  
> -	/* back pointer to the DMA controller structure */
> -	struct cppi		*controller;
> +	/* back pointer to the Dma Controller structure */
> +	struct cppi *controller;
>  
>  	/* which direction of which endpoint? */
> -	struct musb_hw_ep	*hw_ep;
> -	bool			transmit;
> -	u8			index;
> +	struct musb_hw_ep *hw_ep;
> +	u8 transmit;
> +	u8 index;
>  
>  	/* DMA modes:  RNDIS or "transparent" */
> -	u8			is_rndis;
> -
> +	u8 is_rndis;
> +	u8 autoreq;		/* for now keep this; remove it once
> +				 * RNDIS + length < 64 segmentation
> +				 * is done
> +				 */
> +	/* Rx Requested mode */
> +	u8 rxmode;
> +	u8 reqcomplete;		/* zero packet handling */
>  	/* book keeping for current transfer request */
> -	dma_addr_t		buf_dma;
> -	u32			buf_len;
> -	u32			maxpacket;
> -	u32			offset;		/* dma requested */
> +	dma_addr_t buf_dma;
> +	u32 buf_len;
> +	u32 maxpacket;
> +	u32 offset;		/* requested segments */
> +	u32 actuallen;		/* completed (Channel.actual) */
>  
> -	void __iomem		*state_ram;	/* CPPI state */
> -
> -	struct cppi_descriptor	*freelist;
> +	void __iomem *state_ram;	/* CPPI state */
>  
>  	/* BD management fields */
> -	struct cppi_descriptor	*head;
> -	struct cppi_descriptor	*tail;
> -	struct cppi_descriptor	*last_processed;
> +	struct cppi_descriptor *freelist;	/* Free BD Pool head pointer */
> +	struct cppi_descriptor *head;
> +	struct cppi_descriptor *tail;
> +	struct cppi_descriptor *last_processed;
>  
>  	/* use tx_complete in host role to track endpoints waiting for
>  	 * FIFONOTEMPTY to clear.
>  	 */
> -	struct list_head	tx_complete;
> +	struct list_head tx_complete;
> +	u32 iso_desc;
>  };
>  
> -/* CPPI DMA controller object */
> +/**
> + *  CPPI Dma Controller Object
> + *
> + *  CPPI DMA controller object.  Encapsulates all bookkeeping and data
> + *  structures pertaining to the CPPI DMA controller.
> + */
>  struct cppi {
> -	struct dma_controller		controller;
> -	struct musb			*musb;
> -	void __iomem			*mregs;		/* Mentor regs */
> -	void __iomem			*tibase;	/* TI/CPPI regs */
> +	/* FIXME switchover to container_of() and remove the
> +	 * unsafe typecasts...
> +	 */
> +	struct dma_controller controller;
> +	struct musb *musb;
> +	void __iomem *mregs;
> +	void __iomem *tibase;
> +
>  
> -	struct cppi_channel		tx[MUSB_C_NUM_EPT - 1];
> -	struct cppi_channel		rx[MUSB_C_NUM_EPR - 1];
> +	struct cppi_channel tx[MUSB_C_NUM_EPT - 1];
> +	struct cppi_channel rx[MUSB_C_NUM_EPR - 1];
>  
> -	struct dma_pool			*pool;
> +	struct dma_pool *pool;
>  
> -	struct list_head		tx_complete;
> +	struct list_head tx_complete;
>  };
>  
>  /* irq handling hook */
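
The hw_options masks at the top of this header are how ownership and
packet framing are signalled to the DMA engine.  A hypothetical
illustration (bd_fill() and the length-in-the-low-16-bits encoding are
assumptions based on the masks above, not code from this patch):

	/* Hypothetical sketch: fill one BD of a queued transfer.
	 * CPPI_OWN_SET hands the descriptor to CPPI; SOP/EOP frame a
	 * "DMA packet"; the low 16 bits carry the buffer length.
	 */
	static void bd_fill(struct cppi_descriptor *bd, dma_addr_t buf,
			    u32 len, int sop, int eop)
	{
		bd->hw_bufp = (u32) buf;
		bd->hw_off_len = len & CPPI_BUFFER_LEN_MASK; /* offset 0 */
		bd->hw_options = CPPI_OWN_SET
				| (sop ? CPPI_SOP_SET : 0)
				| (eop ? CPPI_EOP_SET : 0)
				| (len & CPPI_BUFFER_LEN_MASK);
	}
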
> diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c
> index 407e422..5cec9ee 100644
> --- a/drivers/usb/musb/musb_core.c
> +++ b/drivers/usb/musb/musb_core.c
> @@ -60,23 +60,23 @@
>   *    includes DaVinci EVM in a common non-OTG mode.
>   *
>   *      * Control and bulk use dedicated endpoints, and there's as
> - *        yet no mechanism to either (a) reclaim the hardware when
> - *        peripherals are NAKing, which gets complicated with bulk
> - *        endpoints, or (b) use more than a single bulk endpoint in
> - *        each direction.
> + *	yet no mechanism to either (a) reclaim the hardware when
> + *	peripherals are NAKing, which gets complicated with bulk
> + *	endpoints, or (b) use more than a single bulk endpoint in
> + *	each direction.
>   *
> - *        RESULT:  one device may be perceived as blocking another one.
> + *	RESULT:  one device may be perceived as blocking another one.
>   *
>   *      * Interrupt and isochronous will dynamically allocate endpoint
> - *        hardware, but (a) there's no record keeping for bandwidth;
> - *        (b) in the common case that few endpoints are available, there
> - *        is no mechanism to reuse endpoints to talk to multiple devices.
> + *	hardware, but (a) there's no record keeping for bandwidth;
> + *	(b) in the common case that few endpoints are available, there
> + *	is no mechanism to reuse endpoints to talk to multiple devices.
>   *
> - *        RESULT:  At one extreme, bandwidth can be overcommitted in
> - *        some hardware configurations, no faults will be reported.
> - *        At the other extreme, the bandwidth capabilities which do
> - *        exist tend to be severely undercommitted.  You can't yet hook
> - *        up both a keyboard and a mouse to an external USB hub.
> + *	RESULT:  At one extreme, bandwidth can be overcommitted in
> + *	some hardware configurations, no faults will be reported.
> + *	At the other extreme, the bandwidth capabilities which do
> + *	exist tend to be severely undercommitted.  You can't yet hook
> + *	up both a keyboard and a mouse to an external USB hub.
>   */
>  
>  /*
> @@ -2028,6 +2028,11 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl)
>  		musb->xceiv.host = &hcd->self;
>  		hcd->power_budget = 2 * (plat->power ? : 250);
>  	}
> +
> +#ifdef CONFIG_ARCH_DAVINCI
> +	tasklet_init(&musb->fifo_check, musb_fifo_check_tasklet,
> +			(unsigned long)musb);
> +#endif
>  #endif				/* CONFIG_USB_MUSB_HDRC_HCD */
>  
>  	/* For the host-only role, we can activate right away.
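
The tasklet_init() above pairs musb->fifo_check with
musb_fifo_check_tasklet() from musb_host.c.  The posted patch only shows
the init and schedule sides; a matching tasklet_kill() somewhere in the
controller teardown path is my assumption, sketched here:

	/* Lifecycle implied by this hunk; the tasklet_kill() placement
	 * is an assumption -- it does not appear in the posted patch.
	 */
	tasklet_init(&musb->fifo_check, musb_fifo_check_tasklet,
		     (unsigned long) musb);	/* controller init */

	tasklet_schedule(&musb->fifo_check);	/* from musb_host_tx() */

	tasklet_kill(&musb->fifo_check);	/* before freeing musb */
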
> diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h
> index 332d39d..2fa34d1 100644
> --- a/drivers/usb/musb/musb_core.h
> +++ b/drivers/usb/musb/musb_core.h
> @@ -239,6 +239,12 @@ enum musb_g_ep0_state {
>  	(musb_readb((_x)->mregs, MUSB_DEVCTL)&MUSB_DEVCTL_HM)
>  
>  #define MUSB_MODE(musb) ((musb)->is_host ? "Host" : "Peripheral")
> +struct musb_iso_desc {
> +	u32     offset;
> +	u32     length;
> +	u32     status;
> +};
> +
>  
>  /******************************** TYPES *************************************/
>  
> @@ -285,6 +291,9 @@ struct musb_hw_ep {
>  
>  	u8			rx_reinit;
>  	u8			tx_reinit;
> +#ifdef CONFIG_ARCH_DAVINCI
> +	u8			fifo_flush_check;	 /* Check FIFO empty */
> +#endif
>  #endif
>  
>  #ifdef CONFIG_USB_GADGET_MUSB_HDRC
> @@ -292,6 +301,9 @@ struct musb_hw_ep {
>  	struct musb_ep		ep_in;			/* TX */
>  	struct musb_ep		ep_out;			/* RX */
>  #endif
> +	struct musb_iso_desc *iso_desc;
> +	u32     num_iso_desc;
> +	u8 zero;
>  };
>  
>  static inline struct usb_request *next_in_request(struct musb_hw_ep *hw_ep)
> @@ -343,6 +355,11 @@ struct musb {
>  	struct list_head	in_bulk;	/* of musb_qh */
>  	struct list_head	out_bulk;	/* of musb_qh */
>  	struct musb_qh		*periodic[32];	/* tree of interrupt+iso */
> +#ifdef CONFIG_ARCH_DAVINCI
> +	struct tasklet_struct	 fifo_check;	/* tasklet for FIFO empty
> +						 * status check
> +						 */
> +#endif
>  #endif
>  
>  	/* called with IRQs blocked; ON/nonzero implies starting a session,
> diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
> index d8ea84d..3b640e9 100644
> --- a/drivers/usb/musb/musb_host.c
> +++ b/drivers/usb/musb/musb_host.c
> @@ -154,7 +154,8 @@ static inline void cppi_host_txdma_start(struct musb_hw_ep *ep)
>  
>  	/* NOTE: no locks here; caller should lock and select EP */
>  	txcsr = musb_readw(ep->regs, MUSB_TXCSR);
> -	txcsr |= MUSB_TXCSR_DMAENAB | MUSB_TXCSR_H_WZC_BITS;
> +	txcsr |= MUSB_TXCSR_DMAENAB | MUSB_TXCSR_DMAMODE |
> +			MUSB_TXCSR_H_WZC_BITS;
>  	musb_writew(ep->regs, MUSB_TXCSR, txcsr);
>  }
>  
> @@ -1242,6 +1243,9 @@ void musb_host_tx(struct musb *musb, u8 epnum)
>  		/* REVISIT may need to clear FLUSHFIFO ... */
>  		musb_writew(epio, MUSB_TXCSR, tx_csr);
>  		musb_writeb(epio, MUSB_TXINTERVAL, 0);
> +#ifdef CONFIG_ARCH_DAVINCI
> +		hw_ep->fifo_flush_check = 0;
> +#endif
>  
>  		done = true;
>  	}
> @@ -1305,8 +1309,19 @@ void musb_host_tx(struct musb *musb, u8 epnum)
>  		/* set status */
>  		urb->status = status;
>  		urb->actual_length = qh->offset;
> +#ifdef CONFIG_ARCH_DAVINCI
> +		/* Check for FIFO empty status; if not empty, wait before
> +		 * completing the URB.  This ensures that the toggle
> +		 * status is correctly preserved and data will not be lost.
> +		 */
> +		if ((tx_csr & MUSB_TXCSR_FIFONOTEMPTY) ||
> +			(tx_csr & MUSB_TXCSR_TXPKTRDY)) {
> +			hw_ep->fifo_flush_check = 1;
> +			tasklet_schedule(&musb->fifo_check);
> +			goto finish;
> +		}
> +#endif
>  		musb_advance_schedule(musb, urb, hw_ep, USB_DIR_OUT);
> -
>  	} else if (!(tx_csr & MUSB_TXCSR_DMAENAB)) {
>  		/* WARN_ON(!buf); */
>  
> @@ -1358,9 +1373,9 @@ finish:
>   *       (a) all URBs terminate with REQPKT cleared and fifo(s) empty;
>   *       (b) termination conditions are: short RX, or buffer full;
>   *       (c) fault modes include
> - *           - iff URB_SHORT_NOT_OK, short RX status is -EREMOTEIO.
> - *             (and that endpoint's dma queue stops immediately)
> - *           - overflow (full, PLUS more bytes in the terminal packet)
> + *	   - iff URB_SHORT_NOT_OK, short RX status is -EREMOTEIO.
> + *	     (and that endpoint's dma queue stops immediately)
> + *	   - overflow (full, PLUS more bytes in the terminal packet)
>   *
>   *	So for example, usb-storage sets URB_SHORT_NOT_OK, and would
>   *	thus be a great candidate for using mode 1 ... for all but the
> @@ -1944,6 +1959,9 @@ static int musb_cleanup_urb(struct urb *urb, struct musb_qh *qh, int is_in)
>  		musb_writew(epio, MUSB_TXCSR, csr);
>  		/* flush cpu writebuffer */
>  		csr = musb_readw(epio, MUSB_TXCSR);
> +#ifdef CONFIG_ARCH_DAVINCI
> +		ep->fifo_flush_check = 0;
> +#endif
>  	}
>  	if (status == 0)
>  		musb_advance_schedule(ep->musb, urb, ep, is_in);
> @@ -2139,6 +2157,46 @@ static int musb_bus_suspend(struct usb_hcd *hcd)
>  		return 0;
>  }
>  
> +#ifdef CONFIG_ARCH_DAVINCI
> +/* Tasklet routine to handle the completion request.  Checks the FIFO
> + * status before completing the request, avoiding false completions
> + * while data is still in the FIFO.
> + */
> +void musb_fifo_check_tasklet(unsigned long data)
> +{
> +	struct musb		 *musb = (struct musb *)data;
> +	u8			 epnum = 1, sch_tsklt = 0;
> +	struct musb_hw_ep	 *hw_ep = NULL;
> +	unsigned long		 flags;
> +	u16			 csr;
> +	struct musb_qh		*qh;
> +
> +	do {
> +		hw_ep = &(musb->endpoints[epnum++]);
> +		spin_lock_irqsave(&musb->lock, flags);
> +		if (hw_ep->fifo_flush_check) {
> +			csr = musb_readw(hw_ep->regs, MUSB_TXCSR);
> +			if (((csr & MUSB_TXCSR_FIFONOTEMPTY) ||
> +				(csr & MUSB_TXCSR_TXPKTRDY)))
> +				sch_tsklt = 1;
> +			else {
> +				hw_ep->fifo_flush_check = 0;
> +				qh = hw_ep->out_qh;
> +				musb_advance_schedule(musb, next_urb(qh),
> +							hw_ep, USB_DIR_OUT);
> +				DBG(6, "Completed Tasklet %d\n", hw_ep->epnum);
> +			}
> +		}
> +
> +		spin_unlock_irqrestore(&musb->lock, flags);
> +	} while (epnum < MUSB_C_NUM_EPS);
> +
> +	if (sch_tsklt)
> +		tasklet_schedule(&musb->fifo_check);
> +}
> +#endif
> +
>  static int musb_bus_resume(struct usb_hcd *hcd)
>  {
>  	/* resuming child port does the work */
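
Both musb_host_tx() and the tasklet above gate URB completion on the
same pair of TXCSR bits.  Condensed into a predicate (illustrative
only; no such helper exists in the patch):

	/* Illustrative: an URB is finished only once the FIFO has
	 * drained and no packet is still flagged ready, which keeps
	 * the data-toggle state intact.
	 */
	static inline int musb_tx_fifo_drained(u16 tx_csr)
	{
		return !(tx_csr &
			 (MUSB_TXCSR_FIFONOTEMPTY | MUSB_TXCSR_TXPKTRDY));
	}
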
> diff --git a/drivers/usb/musb/musb_host.h b/drivers/usb/musb/musb_host.h
> index 77bcdb9..ae598f8 100644
> --- a/drivers/usb/musb/musb_host.h
> +++ b/drivers/usb/musb/musb_host.h
> @@ -88,7 +88,9 @@ extern int musb_hub_status_data(struct usb_hcd *hcd, char *buf);
>  extern int musb_hub_control(struct usb_hcd *hcd,
>  			u16 typeReq, u16 wValue, u16 wIndex,
>  			char *buf, u16 wLength);
> -
> +#ifdef CONFIG_ARCH_DAVINCI
> +extern void musb_fifo_check_tasklet(unsigned long data);
> +#endif
>  extern const struct hc_driver musb_hc_driver;
>  
>  static inline struct urb *next_urb(struct musb_qh *qh)
> @@ -108,3 +110,4 @@ static inline struct urb *next_urb(struct musb_qh *qh)
>  }
>  
>  #endif				/* _MUSB_HOST_H */
> +
>   
Hi Frank,

Once again, thanks for your help. My test results after applying the
DMA patch are:
write ~ 2.5 MB/s
read  ~ 8 MB/s

Thanks & Regards,
Padmanabha.s