/*
 * Somewhat Deficient TCP Server Connection Emulator (SDTSCE)
 *
 * This code emulates the server side of a TCP connection, in a
 * somewhat deficient manner. It waits for a SYN on a port,
 * responds with a SYN+ACK, and then sends packets representing a
 * fixed payload response (drawn from a real Apache web server)
 * back to the connecting client. It neither stores nor ACKs any
 * client data.
 *
 * This code is still not a proper approach to a userland TCP stack.
 * Its purpose is to give you something to explore on the packet level.
 * Look for examples of userland stacks, e.g., linked from
 * https://jvns.ca/blog/2016/06/30/why-do-we-use-the-linux-kernels-tcp-stack/
 *
 * This version handles retransmissions: for every packet sent, a
 * timer is started. If the packet is not ACK-ed within 1 second,
 * it is retransmitted and the retransmission timer is reset.
 * Retransmissions will continue until the payload of the packet
 * is ACK-ed (including cumulatively).
 *
 * This code dynamically allocates memory for the packets
 * in flight, and uses the _timer FD_ to keep track of allocations
 * in the array timer2pkt[].
 * Timer FDs are created by the system, and guaranteed to be unique,
 * so packets will not collide! I am using the system as a source of
 * unique IDs, taking advantage of its synchronization.
 *
 * Note: this requires an array, but so does poll(). Userland multiplexing
 * between several sources of events is tricky, whereas the kernel
 * itself handles this need very differently, via interrupt handlers.
 * For various approaches to userland polling, see the progression
 * from select() to poll() to epoll().
 * https://www.ulduzsoft.com/2014/01/select-poll-epoll-practical-difference-for-system-architects/
 * https://daniel.haxx.se/docs/poll-vs-select.html
 *
 * This code does not (yet) parse TCP options for MSS and SACKs.
 *
 * This code does not do proper retransmission timer back-off.
 *
 * This code does not ACK incoming data!
You will see it retransmitted by the * connecting client. Fix this! * * Find bugs, get extra points! */ #include #include #include #include #include // open #include #include #include #include #include // poll #include // timers #include // inet_addr #include #include #include "pkt.h" #include "tcb.h" // Keep the initial Seq number the same for easier debugging #define INIT_SEQ 0xAAAABBBB // Send TCP payloads in chunks of this size: #define MSS 1000 // Preallocate space for this many packets in flight and their respective timers #define MAX_TIMERS 500 #define TCP_LISTEN_PORT 80 #define TCP_LISTEN_ADDR ntohl(inet_addr("192.168.56.100")) int init_payload( const char* ); int get_payload( char *payload, int *payload_sz, size_t sz, tcb_t *tcb ); void print_packet_info(FILE*, pkt_t *pkt); void print_packet_summary(FILE *s, pkt_t *pkt); int get_window_size( pkt_t* pkt, int* window); void update_window(pkt_t *pkt, tcb_t *tcb); void process_ack(pkt_t *pkt, tcb_t *tcb); void send_and_set_timer(pkt_t *); void link_packet(pkt_t *, tcb_t *); void init_connection(tcb_t *tcp); void reset_connection(tcb_t *tcp); void fill_window_with_payloads(pkt_t*, tcb_t*); pkt_t* make_and_send_synack( pkt_t *in, tcb_t *tcb ); pkt_t* make_and_send_payload_with_ack( pkt_t *in, char *payload, int payload_sz, tcb_t *tcb); pkt_t* make_and_send_fin( pkt_t *in, tcb_t *tcb ); pkt_t* make_and_send_final_ack( pkt_t *in, tcb_t *tcb ); int set_rto_timer(pkt_t*); void reset_rto_timer(pkt_t*, int i); void clear_rto_timer(pkt_t *pkt); int matches_tcb(pkt_t*, tcb_t*); uint16_t checksum (uint16_t *, int); uint16_t tcp4_checksum (char *packet, struct iphdr *ip, struct tcphdr *tcp, int tcp_payload_len); int tcp_raw_socket(); void recv_packet(pkt_t *pkt); pkt_t* make_packet(); // global timers struct pollfd polled[MAX_TIMERS]; // timer fds are guaranteed to be unique by the system; // save a pointer to the timer's packet (and save the fd in the pkt_t) pkt_t *timer2pkt[MAX_TIMERS]; // timer_fd -> pkt_t* // this 
works with the polled[] array for poll()-ing. Can be made non-global, // but poll() logic is still a concession to userland. int num_polled = 1; // slot 0 in polled[] is the raw socket int main(){ int sockfd; int i; pkt_t pkt; tcb_t tcb; sockfd = tcp_raw_socket(); init_connection(&tcb); // Polled will always start with the raw socket, then use slots 1..MAX_TIMERS-1 for timers polled[0].fd = sockfd; polled[0].events = POLLIN; for( i=1; i < MAX_TIMERS; i++ ){ polled[i].fd = -1; // poll ignores negative fds polled[i].events = POLLIN; } // done with init_connection // State machine implementation. Nothing is allowed to fall through without // at least a warning. while(1){ recv_packet(&pkt); // blocking; gets a packet, handles a timer, or exits. if( pkt.invalid ) // timeout in poll, skip to next iteration continue; if( ! matches_tcb( &pkt, &tcb) ){ // a real TCP stack would send a RST here fprintf(stderr, "Packet doesn't match the TCB, ignoring packet:\n"); print_packet_summary(stderr, &pkt); continue; } if( IS_SYN(&pkt) && tcb.state == TCP_LISTEN ){ // send SYN+ACK make_and_send_synack( &pkt, &tcb); update_window( &pkt, &tcb ); tcb.state = TCP_SYN_RECV; } // This is the ACK completing our 3-way handshake else if( IS_ACK(&pkt) && tcb.state == TCP_SYN_RECV && GET_ACK(&pkt) == INIT_SEQ+1 ){ printf( "Handshake complete.\n"); tcb.state = TCP_ESTABLISHED; // This is a hack: we start sending data as soon as the connection // handshake is complete. This is what "nc -l 80 < payload_file" // would do. // Note that there may be several ACKs matching the above test. // Send payload only once. See oversend.pcap with filter // (ip.dst == 192.168.56.100) && (tcp.ack == 1) if( ! tcb.payload_started ){ fill_window_with_payloads(&pkt, &tcb); tcb.payload_started = 1; // After we are done with our payload, we'd send a FIN. 
// But it would mean following the Initiator sequence, // through FIN-WAIT-1, FIN-WAIT-2, etc., // so instead we just wait for incoming FIN, letting // the peer close the connection. //make_and_send_fin( &pkt, &tcb); //tcb.state = TCP_FIN_WAIT1; } } // This is a regular ACK. Find the struct for that packet, and stop its timer. else if( IS_ACK(&pkt) && tcb.state == TCP_ESTABLISHED ){ process_ack(&pkt, &tcb); update_window( &pkt, &tcb ); fill_window_with_payloads(&pkt, &tcb); // stay in ESTABLISHED state // this ACK may have data in it that needs to be ACK-ed. Do it. } else if( IS_ACK(&pkt) && tcb.state == TCP_LAST_ACK ){ // BUG?: check if it matches FIN! process_ack(&pkt, &tcb); // we just got the ACK for our FIN; go back to listening tcb.state = TCP_LISTEN; reset_connection(&tcb); } // Deal with FIN+ACK else if( IS_FINACK(&pkt) && tcb.state == TCP_ESTABLISHED ){ process_ack(&pkt, &tcb); make_and_send_final_ack( &pkt, &tcb); //tcb.state = TCP_CLOSE_WAIT; // By this time, our "application" is already closed; // so send FIN immediately and go to LAST-ACK make_and_send_fin( &pkt, &tcb); tcb.state = TCP_LAST_ACK; } else if( IS_FIN(&pkt) && tcb.state == TCP_ESTABLISHED ){ make_and_send_final_ack( &pkt, &tcb); //tcb.state = TCP_CLOSE_WAIT; // By this time, our application is already closed; so // send FIN and go to LAST-ACK make_and_send_fin( &pkt, &tcb); tcb.state = TCP_LAST_ACK; } else if( IS_RST(&pkt) ){ fprintf(stderr, "Got a RST, exiting:\n" ); print_packet_summary(stderr, &pkt); exit(0); } else if( IS_ACK(&pkt) ){ // a real TCP stack would sent a "go-away" RST here fprintf(stderr, "ACK in wrong state %d (%s), ignoring packet\n", tcb.state, tcp_states[tcb.state] ); } else { fprintf(stderr, "Wrong state %d (%s) for packet:\n", tcb.state, tcp_states[tcb.state] ); print_packet_info(stderr, &pkt); // A special case: repeated SYN in TCP_SYN_RECV. A handshake-completing // ACK may still be coming; ignore SYN and don't reset the connection. 
if( IS_SYN(&pkt) && tcb.state == TCP_SYN_RECV ) continue; // Would send a RST here; instead, we just go to LISTEN tcb.state = TCP_LISTEN; reset_connection(&tcb); } } } /// init_connection void init_connection(tcb_t *tcb) { tcb->dport = TCP_LISTEN_PORT; tcb->daddr = TCP_LISTEN_ADDR; tcb->sport = 0; // to be filled in on SYN tcb->saddr = 0; // ditto tcb->window_scale = 0; tcb->state = TCP_LISTEN; // start in LISTEN tcb->first_unacked = NULL; tcb->last_sent = NULL; tcb->payload_started = 0; tcb->payload_fd = init_payload( "http-jpg-response.txt" ); //global! num_polled = 1; // initially, no timers } void reset_connection(tcb_t *tcb) { pkt_t *p, *q; // clear info about the other end tcb->saddr = 0; tcb->sport = 0; fprintf(stderr, "Resetting connection to LISTEN\n"); // free any pending packets and their timers p = tcb->first_unacked; while( p != NULL ){ fprintf(stderr, "Freeing packet %p\n", p); print_packet_summary(stderr, p); q = p; p = p->next; clear_rto_timer(q); } tcb->first_unacked = NULL; tcb->last_sent = NULL; close(tcb->payload_fd); // restart payload tcb->payload_fd = init_payload( "http-jpg-response.txt" ); num_polled = 1; } // Find the ACK-ed packet's RTO timer and stop it. 
void process_ack(pkt_t *pkt, tcb_t *tcb) { uint32_t ack = GET_ACK(pkt); fprintf( stderr, "ACK for %X\n", ack ); pkt_t *p, *q, *r; int match_found = 0; // Find the packet that just got ACK-ed for( p = tcb->first_unacked; p != NULL && ack >= p->ack ; p = p->next ){ //fprintf(stderr, "Trying packet %p\n", p); //print_packet_summary(stderr, p); if( p->ack == ack ){ match_found = 1; // ACK is cummulative, so clear all packets up to the found p q = tcb->first_unacked; while( q != p ){ r = q; q = q->next; clear_rto_timer(r); // frees the packet } fprintf(stderr, "Moding first_unacked %p -> %p, packet ", tcb->first_unacked, p->next); if( p->next != NULL ) print_packet_summary(stderr, p->next); tcb->first_unacked = p->next; if( tcb->first_unacked != NULL ) tcb->first_unacked->prev = NULL; if( p == tcb->last_sent ) // all list has been emptied tcb->last_sent = NULL; clear_rto_timer(p); // frees the packet break; } } if( !match_found ) fprintf(stderr, "Packet for ACK %X not found! Duplicate ACK or SACK?\n", ack); } // Send as many as fit within the window void fill_window_with_payloads(pkt_t *pkt, tcb_t *tcb) { // payload, hacked-up for a file char payload[MSS]; int payload_sz; int byte_cnt = 0; int unacked_cnt = 0; int i = 1; // count packets in this batch, from 1 int wnd = tcb->window; // calculate how many unACK-ed bytes exist; subtract from window if( tcb->first_unacked != NULL ){ pkt_t *p; for( p = tcb->first_unacked; p != NULL; p = p->next ) unacked_cnt += p->payload_sz; } if( wnd <= unacked_cnt ){ fprintf(stderr, "No room in window: wnd %d, unacked %d\n", wnd, unacked_cnt); return; // nothing we can send at the moment } wnd -= unacked_cnt; fprintf(stderr, "Sending payloads for window %d, unacked %d, remaining %d\n", tcb->window, unacked_cnt, wnd); // what to do if wnd < MSS? Just send one packet of wnd bytes while( byte_cnt < wnd ){ int bite = (wnd - byte_cnt >= MSS) ? 
MSS : wnd - byte_cnt; if( get_payload( payload, &payload_sz, bite, tcb ) > 0 ){ pkt_t *pkt_in_flight = make_and_send_payload_with_ack( pkt, payload, payload_sz, tcb ); byte_cnt += payload_sz; fprintf(stderr, "Sent %d bytes of payload in pkt %d, seq %X, ack %X\n", byte_cnt, i, pkt_in_flight->seq, pkt_in_flight->ack); i++; } else return; // end of file in payload? } } // Saves created SYN+ACK in resp and srcIP, srcPort in TCB // NOTE: this echoes back TCP options from the SYN, including // window scaling and SACK permitted. These will affect // the rest of the connection very substantially. pkt_t* make_and_send_synack( pkt_t *pkt, tcb_t *tcb ) { struct iphdr *ip = pkt->ip; struct tcphdr *tcp = pkt->tcp; int sockfd = pkt->sockfd; char *buf = pkt->buf; int n = pkt->len; pkt_t *resp = make_packet(); resp->sockfd = sockfd; // response will go out of the same socket char *res = resp->buf; memcpy(res, buf, n); struct iphdr *r_ip = (struct iphdr*) res; struct tcphdr *r_tcp = (struct tcphdr*) (res + 4*ip->ihl); r_ip->daddr = ip->saddr; r_ip->saddr = ip->daddr; r_tcp->source = tcp->dest; r_tcp->dest = tcp->source; r_tcp->ack_seq = htonl( ntohl(tcp->seq)+1 ); // consume one seq number; all of this to just add 1 r_tcp->seq = htonl( INIT_SEQ ); r_tcp->ack = 1; // flags | 0x02 r_tcp->check = 0; // checksum will include this field, so it should be zeroed out tcb->seq = INIT_SEQ+1; // keep this in host order tcb->ack = ntohl( r_tcp->ack_seq ); // also host order // set the saddr and dport here (in host order) tcb->sport = ntohs(tcp->source); tcb->saddr = htonl(ip->saddr); // recompute the checksums r_ip->check = checksum( (uint16_t *) res, 4 * ip->ihl); r_tcp->check = tcp4_checksum (res, r_ip, r_tcp, n - 4*ip->ihl - 4*tcp->doff); resp->len = n; resp->sin = pkt->sin; resp->ip = r_ip; resp->tcp = r_tcp; resp->ack = tcb->seq; link_packet(resp, tcb); send_and_set_timer(resp); return resp; } void link_packet(pkt_t *pkt, tcb_t *tcb) { fprintf(stderr, "Linking packet %p: ", pkt); 
print_packet_summary(stderr, pkt); if( tcb->first_unacked == NULL && tcb->last_sent == NULL ){ tcb->first_unacked = pkt; tcb->last_sent = pkt; pkt->prev = NULL; pkt->next = NULL; } else if( tcb->first_unacked != NULL && tcb->last_sent != NULL ){ tcb->last_sent->next = pkt; pkt->prev = tcb->last_sent; pkt->next = NULL; tcb->last_sent = pkt; } else{ fprintf(stderr, "Broken packet list first_unacked=%p last_sent=%p at packet:", tcb->first_unacked, tcb->last_sent ); print_packet_summary(stderr, pkt); exit(1); } } void send_and_set_timer(pkt_t *pkt) { int fd = set_rto_timer(pkt); if( fd >= MAX_TIMERS ){ fprintf(stderr, "Out of timers! Bailing.\n"); exit(1); } timer2pkt[fd] = pkt; // send packet if (sendto ( pkt->sockfd, pkt->buf, pkt->len, 0, (struct sockaddr*) &(pkt->sin), sizeof (struct sockaddr_in)) < 0) { perror ("sendto() failed "); exit (1); } } pkt_t* make_and_send_final_ack( pkt_t *pkt, tcb_t *tcb ) { struct iphdr *ip = pkt->ip; struct tcphdr *tcp = pkt->tcp; int sockfd = pkt->sockfd; char *buf = pkt->buf; // In this case, we DON'T want to echo the packet back! 
// We just want its IP and TCP headers int n = 4*ip->ihl + 4*tcp->doff; // pkt->len; pkt_t *resp = make_packet(); resp->sockfd = sockfd; // response will go out of the same socket char *res = resp->buf; memcpy(res, buf, n); struct iphdr *r_ip = (struct iphdr*) res; struct tcphdr *r_tcp = (struct tcphdr*) (res + 4*ip->ihl); r_ip->daddr = ip->saddr; r_ip->saddr = ip->daddr; r_tcp->source = tcp->dest; r_tcp->dest = tcp->source; r_tcp->ack_seq = htonl( ntohl(tcp->seq) +1 ); // consume one seq number r_tcp->seq = tcp->ack_seq; r_tcp->ack = 1; // flags | 0x02 r_tcp->fin = 0; // clear FIN r_tcp->check = 0; //recompute the checksums r_ip->check = checksum( (uint16_t *) res, 4 * ip->ihl); r_tcp->check = tcp4_checksum (res, r_ip, r_tcp, n - 4*ip->ihl - 4*tcp->doff); resp->len = n; resp->sin = pkt->sin; resp->ip = r_ip; resp->tcp = r_tcp; // link it up link_packet(resp, tcb); send_and_set_timer(resp); return resp; } pkt_t* make_and_send_payload_with_ack( pkt_t *pkt, char *payload, int payload_sz, tcb_t *tcb ) { struct iphdr *ip = pkt->ip; struct tcphdr *tcp = pkt->tcp; int sockfd = pkt->sockfd; char *buf = pkt->buf; // In this case, we DON'T want to echo the packet back! 
// We just want its IP and TCP headers int n = 4*ip->ihl + 4*tcp->doff; // pkt->len; pkt_t *resp = make_packet(); resp->sockfd = sockfd; // response will go out of the same socket char *res = resp->buf; memcpy(res, buf, n); memcpy(res + n, payload, payload_sz); struct iphdr *r_ip = (struct iphdr*) res; struct tcphdr *r_tcp = (struct tcphdr*) (res + 4*ip->ihl); r_ip->daddr = ip->saddr; r_ip->saddr = ip->daddr; r_ip->tot_len += payload_sz; r_tcp->source = tcp->dest; r_tcp->dest = tcp->source; r_tcp->ack_seq = tcp->seq; r_tcp->seq = htonl( tcb->seq ); r_tcp->ack = 1; // flags | 0x02 r_tcp->check = 0; tcb->seq += payload_sz; tcb->ack = ntohl( tcp->ack_seq ); resp->seq = ntohl( r_tcp->seq ); resp->ack = resp->seq + payload_sz ; // how it will be ack-ed resp->len = n+payload_sz; //recompute the checksums r_ip->check = checksum( (uint16_t *) res, 4 * ip->ihl); r_tcp->check = tcp4_checksum (res, r_ip, r_tcp, n - 4*ip->ihl - 4*tcp->doff + payload_sz); resp->sin = pkt->sin; resp->ip = r_ip; resp->tcp = r_tcp; link_packet(resp, tcb); send_and_set_timer(resp); return resp; } // retransmit packet and reset its timer void retransmit_on_timeout( pkt_t *pkt, int i) { int sockfd = pkt->sockfd; struct sockaddr *sin = (struct sockaddr*)&(pkt->sin); fprintf(stderr, "Retransmitting pkt %d ack %X (sock %d)\n", i, pkt->ack, sockfd ); if (sendto (sockfd, pkt->buf, pkt->len, 0, sin, sizeof (struct sockaddr_in)) < 0) { perror ("sendto() failed on retransmission"); exit (1); } reset_rto_timer( pkt, i ); } pkt_t* make_and_send_fin( pkt_t *pkt, tcb_t *tcb ) { struct iphdr *ip = pkt->ip; struct tcphdr *tcp = pkt->tcp; int sockfd = pkt->sockfd; char *buf = pkt->buf; // In this case, we DON'T want to echo the packet back! 
// We just want its IP and TCP headers int n = 4*ip->ihl + 4*tcp->doff; // pkt->len; pkt_t *resp = make_packet(); resp->sockfd = sockfd; // response will go out of the same socket char *res = resp->buf; memcpy(res, buf, n); struct iphdr *r_ip = (struct iphdr*) res; struct tcphdr *r_tcp = (struct tcphdr*) (res + 4*ip->ihl); r_ip->daddr = ip->saddr; r_ip->saddr = ip->daddr; r_tcp->source = tcp->dest; r_tcp->dest = tcp->source; r_tcp->ack_seq = tcp->seq; r_tcp->seq = htonl( tcb->seq ); r_tcp->ack = 0; // clear ACK, we just want the FIN r_tcp->fin = 1; r_tcp->check = 0; //recompute the checksums r_ip->check = checksum( (uint16_t *) res, 4 * ip->ihl); r_tcp->check = tcp4_checksum (res, r_ip, r_tcp, n - 4*ip->ihl - 4*tcp->doff ); resp->len = n; resp->sin = pkt->sin; resp->ip = r_ip; resp->tcp = r_tcp; // link it up link_packet(resp, tcb); //send packet send_and_set_timer(resp); return resp; } // Build IPv4 TCP pseudo-header and call checksum function. // This function should really be zero-copy, but it's left as an exercise // (or look it up in a TCP implementation). uint16_t tcp4_checksum (char *packet, struct iphdr *ip, struct tcphdr *tcp, int tcp_payload_len) { char buf[IP_MAXPACKET]; char *ptr; int chksumlen; // ptr points to beginning of buffer buf ptr = &buf[0]; struct pseudo_tcp_header { in_addr_t saddr, daddr; u_char reserved; u_char protocol; u_short tcp_size; } ps_head; uint16_t header_len = 4*tcp->doff; ps_head.saddr = ip->saddr; ps_head.daddr = ip->daddr; ps_head.reserved = 0; ps_head.protocol = ip->protocol; ps_head.tcp_size = htons(header_len + tcp_payload_len); memcpy(ptr, &ps_head, sizeof(struct pseudo_tcp_header)); ptr += sizeof(struct pseudo_tcp_header); // now copy TCP header memcpy (ptr, tcp, header_len); // this must be result's TCP header! ptr += header_len; // ... and the TCP payload. 
memcpy (ptr, packet + ip->ihl*4 + header_len, tcp_payload_len); ptr += tcp_payload_len; chksumlen = ptr - buf; return checksum ((uint16_t *) buf, chksumlen); } // Computing the internet checksum (RFC 1071). uint16_t checksum (uint16_t *addr, int len) { int count = len; register uint32_t sum = 0; uint16_t answer = 0; // Sum up 2-byte values until none or only one byte left. while (count > 1) { sum += *(addr++); count -= 2; } // Add left-over byte, if any. if (count > 0) { sum += *(uint8_t *) addr; } // Fold 32-bit sum into 16 bits; we lose information by doing this, // increasing the chances of a collision. // sum = (lower 16 bits) + (upper 16 bits shifted right 16 bits) while (sum >> 16) { sum = (sum & 0xffff) + (sum >> 16); } // Checksum is one's compliment of sum. answer = ~sum; return (answer); } void print_packet_summary(FILE *s, pkt_t *pkt) { fprintf(s, " %s:%d -> ", ADDR2A(pkt->ip->saddr), ntohs(pkt->tcp->source) ); fprintf(s, "%s:%d ", ADDR2A(pkt->ip->daddr), ntohs(pkt->tcp->dest) ); fprintf(s, "ack %X\n", pkt->ack ); } void print_packet_info(FILE *s, pkt_t *pkt) { int i; char *buf = pkt->buf; int n = pkt->len; // struct iphdr *ip = pkt->ip; struct tcphdr *tcp = pkt->tcp; // printf("IP header is %d bytes.\n", ip->ihl*4); for (i = 0; i < n; i++) { fprintf(s, "%02X%s", (uint8_t)buf[i], (i + 1)%16 ? " " : "\n"); } fprintf(s, "\n"); // Make like a tiny tcpdump. Note that this ignores other flags & combinations! fprintf(s, "TCP sport=%d, dport=%d ", ntohs(tcp->source), ntohs(tcp->dest)); if (tcp->syn && ! tcp->ack) fprintf(s, "SYN"); else if( tcp->syn && tcp->ack ) fprintf(s, "SYN+ACK"); else if( tcp->ack && !tcp->fin ) fprintf(s, "ACK"); else if( tcp->ack && tcp->fin ) fprintf(s, "FIN+ACK"); // and then what? No cases should fall through! 
fprintf(s, "\n"); } int tcp_raw_socket() { const int on = 1; int sockfd; // create a raw socket for getting and sending packets sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP); if (sockfd < 0){ perror("sock:"); exit(1); } // Set flag so socket expects us to provide IPv4 header. if (setsockopt (sockfd, IPPROTO_IP, IP_HDRINCL, &on, sizeof (on)) < 0) { perror ("setsockopt() failed to set IP_HDRINCL "); exit (1); } return sockfd; } // listens on global polled[], takes action on expired timers void recv_packet(pkt_t *pkt) { int n, rv, i; socklen_t clilen = sizeof(struct sockaddr_in); int sockfd = polled[0].fd; char *buf = pkt->buf; pkt->invalid = 0; printf(" at poll\n"); rv = poll( polled, num_polled, 10000 ); if (rv == -1) { perror("poll:"); // error occurred in poll() } else if (rv == 0) { fprintf(stderr, "Timeout occurred! No data after 10 seconds.\n"); pkt->invalid = 1; return; } else { // We got a packet if (polled[0].revents & POLLIN) { n = recvfrom(sockfd, buf, IP_MAXPACKET, 0, (struct sockaddr *)&(pkt->sin), &clilen); if( n < 0 ){ perror("recv: "); exit(1); } else if( n == 0 ){ fprintf( stderr, "Raw socket recv returned 0; this should not happen\n"); exit(1); } else printf(" rec'd %d bytes\n", n); pkt->sockfd = sockfd; pkt->len = n; pkt->clilen = clilen; struct iphdr *ip_hdr = (struct iphdr *) buf; struct tcphdr *tcp_hdr = (struct tcphdr *)((char *)ip_hdr + (4 * ip_hdr->ihl)); pkt->ip = ip_hdr; pkt->tcp = tcp_hdr; // print whatever info helps to debug print_packet_info(stdout, pkt); } // done with a packet, now handle timers for( i=1; i < num_polled; i++ ){ // traverse timers in polled[] if(polled[i].revents & POLLIN) { uint64_t exp; // must read the expired timer int fd = polled[i].fd; int s = read(fd, &exp, sizeof(uint64_t)); if (s != sizeof(uint64_t)){ perror("timer read: "); exit(1); } fprintf(stderr, "Timeout of TRO %d: %lu\n", i, exp); // restransmit timed-out packet retransmit_on_timeout( timer2pkt[fd], i); } } // done with timers } // end poll } int 
init_payload( const char* filename ) { int pd = open( filename, 0, O_RDONLY ); if( pd < 0 ){ perror("payload open: "); exit(1); } return pd; } // returns read length, 0 for end-of-file, or bust int get_payload( char *payload, int *payload_sz, size_t sz, tcb_t *tcb ) { int n; n = read( tcb->payload_fd, payload, sz); if( n < 0 ){ perror("payload read: "); exit(1); } *payload_sz = n; return n; } // set timer for the packet; return timer fd int set_rto_timer(pkt_t *pkt) { struct itimerspec itspec; int tfd = timerfd_create(CLOCK_MONOTONIC, 0); if (tfd == -1){ perror("timerfd_create: "); exit(1); } itspec.it_value.tv_sec = 1; // expire in 1 sec itspec.it_value.tv_nsec = 0; itspec.it_interval.tv_sec = 0; // ...just once itspec.it_interval.tv_nsec = 0; if (timerfd_settime(tfd, 0, &itspec, NULL) == -1){ perror("timerfd_settime: "); exit(1); } // now activate in polled polled[tfd].fd = tfd; polled[tfd].events = POLLIN; num_polled = (num_polled < tfd+1) ? tfd+1 : num_polled; // now tfd is polled pkt->rto_fd = tfd; pkt->polled_slot = tfd; fprintf(stderr, "Created timer %d for packet ", tfd ); print_packet_summary(stderr, pkt); return tfd; } // reset as timer i in polled void reset_rto_timer(pkt_t *pkt, int i) { struct itimerspec itspec; int tfd = pkt->rto_fd; itspec.it_value.tv_sec = 1; // expire in 1 sec itspec.it_value.tv_nsec = 0; itspec.it_interval.tv_sec = 0; // ...just once itspec.it_interval.tv_nsec = 0; if (timerfd_settime(tfd, 0, &itspec, NULL) == -1){ perror("reset RTO timerfd_settime: "); exit(1); } } // Clear the packet's timer in polled and free the packet // Note: must the the last thing called in a linked list traversal; // doing "p = p->next;" after feeing p will NOT work well. 
void clear_rto_timer(pkt_t *pkt) { int tfd = pkt->rto_fd; // int i = pkt->polled_slot; fprintf(stderr, "Clearing timer fd %d\n", tfd); if( polled[tfd].fd < 0 ){ fprintf(stderr, "WARN: timer %d already cleared!\n", tfd); return; } // de-activate in polled polled[tfd].fd = -tfd; //deallocate the packet free(timer2pkt[tfd]); timer2pkt[tfd] = NULL; close(tfd); } // daddr and dport must match, saddr and sport must match if set // (these are set when sending the SYN+ACK) int matches_tcb(pkt_t* pkt, tcb_t* tcb) { if( tcb->saddr != 0 && tcb->saddr != ntohl(pkt->ip->saddr) ) return 0; if( tcb->sport != 0 && tcb->sport != ntohs(pkt->tcp->source) ) return 0; if( tcb->daddr != ntohl(pkt->ip->daddr) ) return 0; if( tcb->dport != ntohs(pkt->tcp->dest) ) return 0; return 1; } pkt_t* make_packet() { pkt_t *pkt = malloc(sizeof(pkt_t)); if( pkt == NULL ){ perror("malloc pkt_t:"); exit(1); } return pkt; } // Parse the RFC 1323 window scale option out of SYN packet, fill in the value. // According to RFC 1323, this option can only appear on the SYN packet. // Return 1 on success, zero on failure. int get_window_scale( pkt_t *pkt, int *window ) { uint8_t *opt = (uint8_t*)pkt->tcp + sizeof(struct tcphdr); int found = 0; while( *opt != 0 ) { tcp_option_t* _opt = (tcp_option_t*)opt; if( _opt->kind == TCPOPT_NOP ) { ++opt; // NOP is one byte, this is a special case. Other options have explicit length. 
continue; } if( _opt->kind == TCPOPT_WINDOW ) { *window = *(opt + 2); found = 1; break; } opt += _opt->size; } return found; } // Update window in TCB void update_window(pkt_t *pkt, tcb_t *tcb) { int window_scale; int old_window = tcb->window; if( IS_SYN(pkt) ){ if( get_window_scale(pkt, &window_scale) ){ tcb->window_scale = window_scale; tcb->window = ntohs(pkt->tcp->window) << window_scale; fprintf(stderr, "Got window scaling factor %d, window size %d\n", window_scale, tcb->window ); } else{ tcb->window_scale = 0; tcb->window = ntohs(pkt->tcp->window); fprintf(stderr, "No window scaling opting in SYN, window size %d\n", tcb->window ); } } else { // presumes this is an ACK if( tcb->window_scale > 0 ) tcb->window = ntohs(pkt->tcp->window) << tcb->window_scale; else tcb->window = ntohs(pkt->tcp->window); } if( old_window != tcb->window ) fprintf(stderr, "WINDOW: got new window %d (tcp.window %d)\n", tcb->window, ntohs(pkt->tcp->window) ); }